# -*- coding: utf-8 -*-
"""
Created on Wed Jul 23 11:12:40 2014

@author: andylane
Heald Lab, UC Berkeley

Steps:
1) load in abi files
2) extract out the full sgRNA sequences:
    - first trim out everything from 3' pUC19 on
    - match on sgRNA region (~85% identity?)
    - pick out intervening 21mers
3) check results against existing target

Version 2.1:
- Remove code relating to digestion of amps144. May add back later.
- Improve logical flow; v2.0 is convoluted; this is in preparation for
  adaptation to HiSeq data.

To do: Where there are two tgts per sequencing rxn, this version counts both
as non-end. This is incorrect; need to implement a tgt-unique code system.

Version 2.0:
- Matching on amps144 amplicons is using BLAST; this allows some error in
  sequencing etc.

Next:
x figure out how many of the sgRNA adjacent sequences are found using BLAST
  in Amps300: looks like 42
x Which are right at primer termini? looks like 25 :(
x Replace Ns with stars... [not necessary; BLAST respects Ns]
- update printfeat to print multiple features on a map... (include HpaII,
  BfaI, ScrFI).
x How many are next to PAMs as they should be? 14/17, or 82%. Not bad!
"""

In [1]:
import os
import Bio 
import re
import timeit
import copy
from Bio import SeqIO
from Bio.Blast import NCBIXML
from Bio import Restriction 
from Bio.Restriction import *
from Bio.Alphabet.IUPAC import IUPACAmbiguousDNA
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord
from Bio.Blast.Applications import NcbiblastnCommandline
from Bio import SeqFeature
from Bio.SeqFeature import *
import pandas

In [2]:
# Load in the Sanger sequencing data: one SeqRecord per ABI trace file.
abi_dir = "abifiles/5prlib"
lib5pr = []

# os.listdir makes no ordering promise; sort so the record order is reproducible.
for filename in sorted(os.listdir(abi_dir)):
    # Trace files are opened in binary mode; the context manager closes each
    # handle even when parsing fails.
    with open(os.path.join(abi_dir, filename), 'rb') as handle:
        lib5pr.append(SeqIO.read(handle, "abi", alphabet=IUPACAmbiguousDNA()))

In [3]:
# Marker that delineates the end of a target 20mer: the first 9nt of the sgRNA hairpin.
# Results in 45 targets for 5 pr.
sgRNAconst = SeqRecord(Seq("GTTTAAGAG"))
sgRNAfiltlib5pr = copy.deepcopy(lib5pr)

# Tag every occurrence of the 9nt marker, in either orientation, with a SeqFeature
# on (a copy of) each Sanger read.
for seqrecord in sgRNAfiltlib5pr:
    readstr = str(seqrecord.seq)

    # Marker as written: the feature runs from the match start to
    # match start + marker length + 1.
    fwdlocs = [hit.start() for hit in re.finditer(str(sgRNAconst.seq), readstr)]
    for pos in fwdlocs:
        start = ExactPosition(int(pos))
        end = ExactPosition(int(pos + len(sgRNAconst) + 1))
        seqrecord.features.append(
            SeqFeature(FeatureLocation(start, end), type="sgRNAconst", strand=+1))

    # Reverse-complemented marker: the feature starts one position before the
    # match start and is marker-length long.
    revlocs = [hit.start()
               for hit in re.finditer(str(sgRNAconst.reverse_complement().seq), readstr)]
    for pos in revlocs:
        start = ExactPosition(int(pos) - 1)
        end = ExactPosition(start + len(sgRNAconst))
        seqrecord.features.append(
            SeqFeature(FeatureLocation(start, end), type="sgRNAconst", strand=-1))

# Make a new list (SeqRecords) out of the 36 nt window adjacent to each sgRNA
# hairpin marker (the trailing "# -21" / "# +21" notes suggest an earlier
# version took 21 nt).
alltgts = []
for seqrecord in sgRNAfiltlib5pr:
    for feat in seqrecord.features:
        if feat.strand == 1:
            # Window immediately before the marker. Clamp at 0: a negative slice
            # start would wrap around to the end of the read and return an empty
            # or unrelated window when the marker is within 36 nt of the read start.
            tgtstart = max(0, int(feat.location.start) - 36)  # -21
            tgtend = int(feat.location.start)
            sgtgt = seqrecord[tgtstart:tgtend]
            alltgts.append(sgtgt)
        if feat.strand == -1:
            # Window immediately after the marker, flipped so that it reads in
            # the same orientation as the forward-strand targets.
            tgtend = int(feat.location.end) + 36  # +21
            tgtstart = int(feat.location.end)
            sgtgt = seqrecord[tgtstart:tgtend].reverse_complement()
            # The read name is copied over explicitly onto the flipped record.
            sgtgt.name = seqrecord.name
            alltgts.append(sgtgt)

In [4]:
#Can these be accurately trimmed to right after the oligo-derived T7 promoter sequence? gaaatTAATACGACTCACTATAG
# Use CGACTCACTATAG

In [5]:
## NEXT UP: USE THESE CORRECTLY TRIMMED 20MERS; SEE HOW LONG THEY ARE ETC.
# Trim each window so it starts right after the first occurrence of the
# "ACTCACTATAG" motif. Windows without the motif are dropped, as before, but via
# an explicit check instead of a bare except that also hid unrelated errors.
a = []
for item in alltgts:
    t7hit = re.search("ACTCACTATAG", str(item.seq))
    if t7hit is None:
        continue
    a.append(item[t7hit.end():])
alltgts = copy.deepcopy(a)

In [6]:
# Show each trimmed target sequence, one per line.
for tgt in a:
    print(tgt.seq)


ACCATGCTACTTGAGGGCATT
GAGCTGGCATTGGCCTGTCC
NCATGCATTAAAGCCATTGCC
CTAATGGTCTGGTTGGCTGGTG
GCGTANGNNNNTNNNANNNNNG
AGCTAATATTTTGGGGGCCCC
CCATCCATATGCAACTCATGA
NNNNNNNNNNNNNTTNNNANNNCC
ATGGCCACTACAGGGACCCC
CTGTGCATGGGAGCCAGATCTG
TACAGCACGGCTACAATGGAG
TAGGAGGCACTTTGAAGAGCAG
GGGTGAAGGACAGCCCAATTAG
TCCATATCTGAGTAGTGGGCCG
CTACAGCACGGCTACAATGGAG
CCACAAAAGGGGATCTGAACCG
CCATGCTACTTGAGGGCATTG
ATATAAAAAAGTGAAGGACCTG
ACACAGAATAAGGTCACTTACG
NANGAGGCACTTTGAAGAGCA
GTCTTACCAAGGCACATGTGG
CTCAAGTAAGGCATTCCTGA
GCCCCATGGGAGTGACAGCAC
CTATGTGNNNCNNGNANNNCC
CTGTGCATGGGAGCCAGATCT
GTTGCAGGTGTGGGAGAGCCG
TGCTCCTTGTCTTTCCCTCCG
TTAACGACGGCAGTAGCTTCCG
GCAGGGTCTCCTCTTCAATT
CTGTGTCACGGTCTTGTCTCC
TAACGACGGCAGTAGCTTCCG
CACTTGCCCTAATTAAATCCG
TTTTACCAAGGCATCAGGTGAG
NNNNNNNACAGATTCNNNNNG
GTTCAGAAAGCAACAACCCC
ANNNNNNNTANNNNNNANTANNGN
CAGAAGTCACAAGTGAACCC
GCGTTGGCGAGTCTCTGATTA
AGGACAACAGGGGGAGCCCC
GCCTGTATGTGAACATCAGC
ACCATGCTACTTGAGGGCATTG
AGAAGGTGGTATGCTGATGGG

In [7]:
# Show the length of each trimmed target, one per line.
for tgt in a:
    print(len(tgt))


21
20
21
22
22
21
21
24
20
22
21
22
22
22
22
22
21
22
22
21
21
20
21
21
21
21
21
22
20
21
21
21
22
21
20
24
20
21
20
20
22
21

In [8]:
# Sorted target lengths, displayed with their rank.
b = sorted(len(tgt) for tgt in a)
list(enumerate(b))


Out[8]:
[(0, 20),
 (1, 20),
 (2, 20),
 (3, 20),
 (4, 20),
 (5, 20),
 (6, 20),
 (7, 20),
 (8, 21),
 (9, 21),
 (10, 21),
 (11, 21),
 (12, 21),
 (13, 21),
 (14, 21),
 (15, 21),
 (16, 21),
 (17, 21),
 (18, 21),
 (19, 21),
 (20, 21),
 (21, 21),
 (22, 21),
 (23, 21),
 (24, 21),
 (25, 21),
 (26, 21),
 (27, 22),
 (28, 22),
 (29, 22),
 (30, 22),
 (31, 22),
 (32, 22),
 (33, 22),
 (34, 22),
 (35, 22),
 (36, 22),
 (37, 22),
 (38, 22),
 (39, 22),
 (40, 24),
 (41, 24)]

In [9]:
# Collect the amplicon records the targets are compared against: amps144 plus
# the extra amplicons, in that order.
# (The masked variant "amps144masked_iter0.fasta" was an earlier alternative
# to amps144.fasta and is not loaded.)
allamps = []
for fasta_path in ("amps144.fasta", "theextraamps.fasta"):
    allamps.extend(SeqIO.parse(fasta_path, "fasta"))

Next: re-generate the list of cut guides that was previously used for BLAST. Are the hits (within amps300) actually within the expected PAM-adjacent cut sites?

This is somewhat of a reboot: starting way back from the 21mers after T7 in sequencing data, BLAST against a newly made Amps144 db. The goal is to get the match locations and visualize them.


In [10]:
# Dump the trimmed targets to FASTA, then BLAST them (blastn-short, XML output)
# against the amps144 database, timing the single run.
SeqIO.write(alltgts, "alltgtstemp.fa", "fasta")
blastn_cline = NcbiblastnCommandline(
    query="alltgtstemp.fa",
    db="amps144",
    task="blastn-short",
    outfmt=5,
    out="alltgts.blast",
    max_target_seqs=100,
    num_threads=7,
    evalue=0.005,
)
timeit.timeit(blastn_cline, number=1)


Out[10]:
1.4703609943389893

In [11]:
# Read every BLAST record (one per query) from the XML output into a list.
# The context manager closes the file even if parsing raises.
with open("alltgts.blast") as result_handle:
    blast_records = NCBIXML.parse(result_handle)  # use NCBIXML.parse(result_handle) for multiple queries here
    # Consume the parser while the handle is still open.
    blast_records_list = list(blast_records)

In [12]:
# Pair each target with the BLAST record at the same index.
blastsandrecords = [(tgt, blast_records_list[idx]) for idx, tgt in enumerate(alltgts)]

In [12]:


In [13]:
# Print every target with the amplicons it hit and each HSP; i counts the
# alignments (hit amplicons) over all targets.
i = 0
for tgt, blast_record in blastsandrecords:
    print(tgt.name + " " + tgt.seq) # Print out the query seq and its title, basically
    for alignment in blast_record.alignments:
        print(alignment.title) # Print out each Amp of amps144+extra amps that a match is made on
        i += 1
        for hsp in alignment.hsps:
            print("--" + str(hsp)) # Within each hit amp, print out each specific hit sequence
    print("\n")

print(i)


02-5-1_B01_013 ACCATGCTACTTGAGGGCATT
gnl|BL_ORD_ID|81 986031 <unknown description>
--Score 21 (42 bits), expectation 4.0e-06, alignment length 21
Query:       1 ACCATGCTACTTGAGGGCATT 21
               |||||||||||||||||||||
Sbjct:    3381 ACCATGCTACTTGAGGGCATT 3361


02-5-1_B01_013 GAGCTGGCATTGGCCTGTCC
gnl|BL_ORD_ID|54 6229761 <unknown description>
--Score 16 (32 bits), expectation 3.5e-03, alignment length 20
Query:       1 GAGCTGGCATTGGCCTGTCC 20
               |||||||||||| |||||||
Sbjct:    1275 GAGCTGGCATTGACCTGTCC 1256


04-5-3_D01_009 NCATGCATTAAAGCCATTGCC
gnl|BL_ORD_ID|171 3385262 <unknown description>
--Score 20 (40 bits), expectation 1.6e-05, alignment length 20
Query:       2 CATGCATTAAAGCCATTGCC 21
               ||||||||||||||||||||
Sbjct:    1093 CATGCATTAAAGCCATTGCC 1074


09-5-4_A02_016 CTAATGGTCTGGTTGGCTGGTG
gnl|BL_ORD_ID|104 2901440 <unknown description>
--Score 21 (42 bits), expectation 4.5e-06, alignment length 21
Query:       1 CTAATGGTCTGGTTGGCTGGT 21
               |||||||||||||||||||||
Sbjct:     747 CTAATGGTCTGGTTGGCTGGT 727


10-5-5_B02_014 GCGTANGNNNNTNNNANNNNNG


11-5-6_C02_012 AGCTAATATTTTGGGGGCCCC
gnl|BL_ORD_ID|143 3110360 <unknown description>
--Score 21 (42 bits), expectation 4.0e-06, alignment length 21
Query:       1 AGCTAATATTTTGGGGGCCCC 21
               |||||||||||||||||||||
Sbjct:    2628 AGCTAATATTTTGGGGGCCCC 2608


12-5-7_D02_010 CCATCCATATGCAACTCATGA
gnl|BL_ORD_ID|171 3385262 <unknown description>
--Score 21 (42 bits), expectation 4.0e-06, alignment length 21
Query:       1 CCATCCATATGCAACTCATGA 21
               |||||||||||||||||||||
Sbjct:     147 CCATCCATATGCAACTCATGA 127


17-5-8_A03_031 NNNNNNNNNNNNNTTNNNANNNCC


17-5-8_A03_031 ATGGCCACTACAGGGACCCC
gnl|BL_ORD_ID|161 9981706 <unknown description>
--Score 20 (40 bits), expectation 1.4e-05, alignment length 20
Query:       1 ATGGCCACTACAGGGACCCC 20
               ||||||||||||||||||||
Sbjct:    4432 ATGGCCACTACAGGGACCCC 4413


18-5-9_B03_029 CTGTGCATGGGAGCCAGATCTG
gnl|BL_ORD_ID|312 3476947 18
--Score 22 (44 bits), expectation 1.1e-06, alignment length 22
Query:       1 CTGTGCATGGGAGCCAGATCTG 22
               ||||||||||||||||||||||
Sbjct:    1427 CTGTGCATGGGAGCCAGATCTG 1448
gnl|BL_ORD_ID|10 3476947 <unknown description>
--Score 22 (44 bits), expectation 1.1e-06, alignment length 22
Query:       1 CTGTGCATGGGAGCCAGATCTG 22
               ||||||||||||||||||||||
Sbjct:    1427 CTGTGCATGGGAGCCAGATCTG 1448


19-5-10_C03_027 TACAGCACGGCTACAATGGAG
gnl|BL_ORD_ID|3 131171 <unknown description>
--Score 21 (42 bits), expectation 4.0e-06, alignment length 21
Query:       1 TACAGCACGGCTACAATGGAG 21
               |||||||||||||||||||||
Sbjct:   10583 TACAGCACGGCTACAATGGAG 10563


20-5-11_D03_025 TAGGAGGCACTTTGAAGAGCAG
gnl|BL_ORD_ID|333 8426516 20
--Score 22 (44 bits), expectation 1.1e-06, alignment length 22
Query:       1 TAGGAGGCACTTTGAAGAGCAG 22
               ||||||||||||||||||||||
Sbjct:    1692 TAGGAGGCACTTTGAAGAGCAG 1713
gnl|BL_ORD_ID|283 8424649 <unknown description>
--Score 22 (44 bits), expectation 1.1e-06, alignment length 22
Query:       1 TAGGAGGCACTTTGAAGAGCAG 22
               ||||||||||||||||||||||
Sbjct:    3559 TAGGAGGCACTTTGAAGAGCAG 3580


20-5-11_D03_025 GGGTGAAGGACAGCCCAATTAG
gnl|BL_ORD_ID|59 8480056 <unknown description>
--Score 21 (42 bits), expectation 4.5e-06, alignment length 21
Query:       1 GGGTGAAGGACAGCCCAATTA 21
               |||||||||||||||||||||
Sbjct:    3854 GGGTGAAGGACAGCCCAATTA 3874


25-5-12_A04_032 TCCATATCTGAGTAGTGGGCCG
gnl|BL_ORD_ID|143 3110360 <unknown description>
--Score 21 (42 bits), expectation 4.5e-06, alignment length 21
Query:       1 TCCATATCTGAGTAGTGGGCC 21
               |||||||||||||||||||||
Sbjct:    3048 TCCATATCTGAGTAGTGGGCC 3028


25-5-12_A04_032 CTACAGCACGGCTACAATGGAG
gnl|BL_ORD_ID|3 131171 <unknown description>
--Score 22 (44 bits), expectation 1.1e-06, alignment length 22
Query:       1 CTACAGCACGGCTACAATGGAG 22
               ||||||||||||||||||||||
Sbjct:   10584 CTACAGCACGGCTACAATGGAG 10563


26-5-13_B04_030 CCACAAAAGGGGATCTGAACCG
gnl|BL_ORD_ID|3 131171 <unknown description>
--Score 21 (42 bits), expectation 4.5e-06, alignment length 21
Query:       1 CCACAAAAGGGGATCTGAACC 21
               |||||||||||||||||||||
Sbjct:   10985 CCACAAAAGGGGATCTGAACC 11005


27-5-14_C04_028 CCATGCTACTTGAGGGCATTG
gnl|BL_ORD_ID|81 986031 <unknown description>
--Score 20 (40 bits), expectation 1.6e-05, alignment length 20
Query:       1 CCATGCTACTTGAGGGCATT 20
               ||||||||||||||||||||
Sbjct:    3380 CCATGCTACTTGAGGGCATT 3361


28-5-15_D04_026 ATATAAAAAAGTGAAGGACCTG
gnl|BL_ORD_ID|250 9278465 <unknown description>
--Score 22 (44 bits), expectation 1.1e-06, alignment length 22
Query:       1 ATATAAAAAAGTGAAGGACCTG 22
               ||||||||||||||||||||||
Sbjct:     226 ATATAAAAAAGTGAAGGACCTG 205


35-5-18_C05_043 ACACAGAATAAGGTCACTTACG
gnl|BL_ORD_ID|33 7685624 <unknown description>
--Score 17 (34 bits), expectation 1.1e-03, alignment length 21
Query:       1 ACACAGAATAAGGTCACTTAC 21
               ||||||||||| |||||||||
Sbjct:    1501 ACACAGAATAACGTCACTTAC 1521


36-5-19_D05_041 NANGAGGCACTTTGAAGAGCA
gnl|BL_ORD_ID|333 8426516 20
--Score 18 (36 bits), expectation 2.5e-04, alignment length 18
Query:       4 GAGGCACTTTGAAGAGCA 21
               ||||||||||||||||||
Sbjct:    1695 GAGGCACTTTGAAGAGCA 1712
gnl|BL_ORD_ID|283 8424649 <unknown description>
--Score 18 (36 bits), expectation 2.5e-04, alignment length 18
Query:       4 GAGGCACTTTGAAGAGCA 21
               ||||||||||||||||||
Sbjct:    3562 GAGGCACTTTGAAGAGCA 3579


41-5-20_A06_048 GTCTTACCAAGGCACATGTGG
gnl|BL_ORD_ID|154 270146 <unknown description>
--Score 21 (42 bits), expectation 4.0e-06, alignment length 21
Query:       1 GTCTTACCAAGGCACATGTGG 21
               |||||||||||||||||||||
Sbjct:     136 GTCTTACCAAGGCACATGTGG 116


41-5-20_A06_048 CTCAAGTAAGGCATTCCTGA
gnl|BL_ORD_ID|41 9593441 <unknown description>
--Score 20 (40 bits), expectation 1.4e-05, alignment length 20
Query:       1 CTCAAGTAAGGCATTCCTGA 20
               ||||||||||||||||||||
Sbjct:    5151 CTCAAGTAAGGCATTCCTGA 5132


43-5-22_C06_044 GCCCCATGGGAGTGACAGCAC
gnl|BL_ORD_ID|104 2901440 <unknown description>
--Score 21 (42 bits), expectation 4.0e-06, alignment length 21
Query:       1 GCCCCATGGGAGTGACAGCAC 21
               |||||||||||||||||||||
Sbjct:    6504 GCCCCATGGGAGTGACAGCAC 6484


44-5-23_D06_042 CTATGTGNNNCNNGNANNNCC


50-5-25_B07_061 CTGTGCATGGGAGCCAGATCT
gnl|BL_ORD_ID|312 3476947 18
--Score 21 (42 bits), expectation 4.0e-06, alignment length 21
Query:       1 CTGTGCATGGGAGCCAGATCT 21
               |||||||||||||||||||||
Sbjct:    1427 CTGTGCATGGGAGCCAGATCT 1447
gnl|BL_ORD_ID|10 3476947 <unknown description>
--Score 21 (42 bits), expectation 4.0e-06, alignment length 21
Query:       1 CTGTGCATGGGAGCCAGATCT 21
               |||||||||||||||||||||
Sbjct:    1427 CTGTGCATGGGAGCCAGATCT 1447


51-5-26_C07_059 GTTGCAGGTGTGGGAGAGCCG
gnl|BL_ORD_ID|17 7562871 <unknown description>
--Score 20 (40 bits), expectation 1.6e-05, alignment length 20
Query:       1 GTTGCAGGTGTGGGAGAGCC 20
               ||||||||||||||||||||
Sbjct:    7569 GTTGCAGGTGTGGGAGAGCC 7550


52-5-27_D07_057 TGCTCCTTGTCTTTCCCTCCG
gnl|BL_ORD_ID|312 3476947 18
--Score 20 (40 bits), expectation 1.6e-05, alignment length 20
Query:       1 TGCTCCTTGTCTTTCCCTCC 20
               ||||||||||||||||||||
Sbjct:     703 TGCTCCTTGTCTTTCCCTCC 722
gnl|BL_ORD_ID|10 3476947 <unknown description>
--Score 20 (40 bits), expectation 1.6e-05, alignment length 20
Query:       1 TGCTCCTTGTCTTTCCCTCC 20
               ||||||||||||||||||||
Sbjct:     703 TGCTCCTTGTCTTTCCCTCC 722


52-5-27_D07_057 TTAACGACGGCAGTAGCTTCCG
gnl|BL_ORD_ID|277 4995367 <unknown description>
--Score 21 (42 bits), expectation 4.5e-06, alignment length 21
Query:       1 TTAACGACGGCAGTAGCTTCC 21
               |||||||||||||||||||||
Sbjct:      89 TTAACGACGGCAGTAGCTTCC 69


57-5-28_A08_064 GCAGGGTCTCCTCTTCAATT
gnl|BL_ORD_ID|13 5085511 <unknown description>
--Score 20 (40 bits), expectation 1.4e-05, alignment length 20
Query:       1 GCAGGGTCTCCTCTTCAATT 20
               ||||||||||||||||||||
Sbjct:    4050 GCAGGGTCTCCTCTTCAATT 4069


58-5-29_B08_062 CTGTGTCACGGTCTTGTCTCC
gnl|BL_ORD_ID|3 131171 <unknown description>
--Score 21 (42 bits), expectation 4.0e-06, alignment length 21
Query:       1 CTGTGTCACGGTCTTGTCTCC 21
               |||||||||||||||||||||
Sbjct:   12262 CTGTGTCACGGTCTTGTCTCC 12282


60-5-31_D08_058 TAACGACGGCAGTAGCTTCCG
gnl|BL_ORD_ID|277 4995367 <unknown description>
--Score 20 (40 bits), expectation 1.6e-05, alignment length 20
Query:       1 TAACGACGGCAGTAGCTTCC 20
               ||||||||||||||||||||
Sbjct:      88 TAACGACGGCAGTAGCTTCC 69


65-5-32_A09_079 CACTTGCCCTAATTAAATCCG
gnl|BL_ORD_ID|161 9981706 <unknown description>
--Score 20 (40 bits), expectation 1.6e-05, alignment length 20
Query:       1 CACTTGCCCTAATTAAATCC 20
               ||||||||||||||||||||
Sbjct:    4120 CACTTGCCCTAATTAAATCC 4101


67-5-34_C09_075 TTTTACCAAGGCATCAGGTGAG
gnl|BL_ORD_ID|176 1639848 <unknown description>
--Score 22 (44 bits), expectation 1.1e-06, alignment length 22
Query:       1 TTTTACCAAGGCATCAGGTGAG 22
               ||||||||||||||||||||||
Sbjct:    3335 TTTTACCAAGGCATCAGGTGAG 3356


67-5-34_C09_075 NNNNNNNACAGATTCNNNNNG


68-5-35_D09_073 GTTCAGAAAGCAACAACCCC
gnl|BL_ORD_ID|81 986031 <unknown description>
--Score 20 (40 bits), expectation 1.4e-05, alignment length 20
Query:       1 GTTCAGAAAGCAACAACCCC 20
               ||||||||||||||||||||
Sbjct:    5415 GTTCAGAAAGCAACAACCCC 5396


73-5-36_A10_080 ANNNNNNNTANNNNNNANTANNGN


76-5-39_D10_074 CAGAAGTCACAAGTGAACCC
gnl|BL_ORD_ID|192 895665 <unknown description>
--Score 20 (40 bits), expectation 1.4e-05, alignment length 20
Query:       1 CAGAAGTCACAAGTGAACCC 20
               ||||||||||||||||||||
Sbjct:     913 CAGAAGTCACAAGTGAACCC 894


81-5-40_A11_095 GCGTTGGCGAGTCTCTGATTA
gnl|BL_ORD_ID|54 6229761 <unknown description>
--Score 21 (42 bits), expectation 4.0e-06, alignment length 21
Query:       1 GCGTTGGCGAGTCTCTGATTA 21
               |||||||||||||||||||||
Sbjct:    2616 GCGTTGGCGAGTCTCTGATTA 2596


82-5-41_B11_093 AGGACAACAGGGGGAGCCCC
gnl|BL_ORD_ID|12 5570943 <unknown description>
--Score 20 (40 bits), expectation 1.4e-05, alignment length 20
Query:       1 AGGACAACAGGGGGAGCCCC 20
               ||||||||||||||||||||
Sbjct:    2484 AGGACAACAGGGGGAGCCCC 2465


83-5-42_C11_091 GCCTGTATGTGAACATCAGC
gnl|BL_ORD_ID|171 3385262 <unknown description>
--Score 20 (40 bits), expectation 1.4e-05, alignment length 20
Query:       1 GCCTGTATGTGAACATCAGC 20
               ||||||||||||||||||||
Sbjct:     844 GCCTGTATGTGAACATCAGC 825


89-5-44_A12_096 ACCATGCTACTTGAGGGCATTG
gnl|BL_ORD_ID|81 986031 <unknown description>
--Score 21 (42 bits), expectation 4.5e-06, alignment length 21
Query:       1 ACCATGCTACTTGAGGGCATT 21
               |||||||||||||||||||||
Sbjct:    3381 ACCATGCTACTTGAGGGCATT 3361


90-5-45_B12_094 AGAAGGTGGTATGCTGATGGG
gnl|BL_ORD_ID|139 4286355 <unknown description>
--Score 21 (42 bits), expectation 4.0e-06, alignment length 21
Query:       1 AGAAGGTGGTATGCTGATGGG 21
               |||||||||||||||||||||
Sbjct:    1662 AGAAGGTGGTATGCTGATGGG 1682


42

In [14]:
#blastsandrecords[0][1].alignments[0].hsps[0].sbjct_start
#blastsandrecords[0][1].alignments[0].hsps[0].sbjct_end
#blastsandrecords[3][1].alignments[0].title.split()[1] # splits the generated BLAST alignment title on \
                                                      # whitespace; extracts the second element which \
                                                      # corresponds to the position on the scaffold

In [14]:


In [15]:
# Build mappable: one (target, (amplicon id, subject start, subject end), match
# string) entry per alignment, using only the first HSP of each alignment.
mappable = []
for tgt, blast in blastsandrecords:
    for i, alignment in enumerate(blast.alignments):
        try:
            # The second whitespace-separated token of the title is used as the amplicon id.
            pcr = alignment.title.split()[1]
            hsp = alignment.hsps[0]
        except IndexError:
            # Title without a second token, or alignment without HSPs: skip it.
            # (Previously a bare except around the whole body hid every error.)
            continue
        start = hsp.sbjct_start
        end = hsp.sbjct_end
        match = hsp.match
        query = hsp.query
        # Indent the aligned rows so they line up under the full target sequence.
        pad = " " * (hsp.query_start - 1)
        print(alignment.title)
        print(pcr)
        print(i)
        print(pad + query)
        print(pad + match)
        print(tgt.seq + "\n")
        print("")
        mapstring = (pcr, start, end)
        mappable.append((tgt, mapstring, match))


gnl|BL_ORD_ID|81 986031 <unknown description>
986031
0
ACCATGCTACTTGAGGGCATT
|||||||||||||||||||||
ACCATGCTACTTGAGGGCATT


gnl|BL_ORD_ID|54 6229761 <unknown description>
6229761
0
GAGCTGGCATTGGCCTGTCC
|||||||||||| |||||||
GAGCTGGCATTGGCCTGTCC


gnl|BL_ORD_ID|171 3385262 <unknown description>
3385262
0
 CATGCATTAAAGCCATTGCC
 ||||||||||||||||||||
NCATGCATTAAAGCCATTGCC


gnl|BL_ORD_ID|104 2901440 <unknown description>
2901440
0
CTAATGGTCTGGTTGGCTGGT
|||||||||||||||||||||
CTAATGGTCTGGTTGGCTGGTG


gnl|BL_ORD_ID|143 3110360 <unknown description>
3110360
0
AGCTAATATTTTGGGGGCCCC
|||||||||||||||||||||
AGCTAATATTTTGGGGGCCCC


gnl|BL_ORD_ID|171 3385262 <unknown description>
3385262
0
CCATCCATATGCAACTCATGA
|||||||||||||||||||||
CCATCCATATGCAACTCATGA


gnl|BL_ORD_ID|161 9981706 <unknown description>
9981706
0
ATGGCCACTACAGGGACCCC
||||||||||||||||||||
ATGGCCACTACAGGGACCCC


gnl|BL_ORD_ID|312 3476947 18
3476947
0
CTGTGCATGGGAGCCAGATCTG
||||||||||||||||||||||
CTGTGCATGGGAGCCAGATCTG


gnl|BL_ORD_ID|10 3476947 <unknown description>
3476947
1
CTGTGCATGGGAGCCAGATCTG
||||||||||||||||||||||
CTGTGCATGGGAGCCAGATCTG


gnl|BL_ORD_ID|3 131171 <unknown description>
131171
0
TACAGCACGGCTACAATGGAG
|||||||||||||||||||||
TACAGCACGGCTACAATGGAG


gnl|BL_ORD_ID|333 8426516 20
8426516
0
TAGGAGGCACTTTGAAGAGCAG
||||||||||||||||||||||
TAGGAGGCACTTTGAAGAGCAG


gnl|BL_ORD_ID|283 8424649 <unknown description>
8424649
1
TAGGAGGCACTTTGAAGAGCAG
||||||||||||||||||||||
TAGGAGGCACTTTGAAGAGCAG


gnl|BL_ORD_ID|59 8480056 <unknown description>
8480056
0
GGGTGAAGGACAGCCCAATTA
|||||||||||||||||||||
GGGTGAAGGACAGCCCAATTAG


gnl|BL_ORD_ID|143 3110360 <unknown description>
3110360
0
TCCATATCTGAGTAGTGGGCC
|||||||||||||||||||||
TCCATATCTGAGTAGTGGGCCG


gnl|BL_ORD_ID|3 131171 <unknown description>
131171
0
CTACAGCACGGCTACAATGGAG
||||||||||||||||||||||
CTACAGCACGGCTACAATGGAG


gnl|BL_ORD_ID|3 131171 <unknown description>
131171
0
CCACAAAAGGGGATCTGAACC
|||||||||||||||||||||
CCACAAAAGGGGATCTGAACCG


gnl|BL_ORD_ID|81 986031 <unknown description>
986031
0
CCATGCTACTTGAGGGCATT
||||||||||||||||||||
CCATGCTACTTGAGGGCATTG


gnl|BL_ORD_ID|250 9278465 <unknown description>
9278465
0
ATATAAAAAAGTGAAGGACCTG
||||||||||||||||||||||
ATATAAAAAAGTGAAGGACCTG


gnl|BL_ORD_ID|33 7685624 <unknown description>
7685624
0
ACACAGAATAAGGTCACTTAC
||||||||||| |||||||||
ACACAGAATAAGGTCACTTACG


gnl|BL_ORD_ID|333 8426516 20
8426516
0
   GAGGCACTTTGAAGAGCA
   ||||||||||||||||||
NANGAGGCACTTTGAAGAGCA


gnl|BL_ORD_ID|283 8424649 <unknown description>
8424649
1
   GAGGCACTTTGAAGAGCA
   ||||||||||||||||||
NANGAGGCACTTTGAAGAGCA


gnl|BL_ORD_ID|154 270146 <unknown description>
270146
0
GTCTTACCAAGGCACATGTGG
|||||||||||||||||||||
GTCTTACCAAGGCACATGTGG


gnl|BL_ORD_ID|41 9593441 <unknown description>
9593441
0
CTCAAGTAAGGCATTCCTGA
||||||||||||||||||||
CTCAAGTAAGGCATTCCTGA


gnl|BL_ORD_ID|104 2901440 <unknown description>
2901440
0
GCCCCATGGGAGTGACAGCAC
|||||||||||||||||||||
GCCCCATGGGAGTGACAGCAC


gnl|BL_ORD_ID|312 3476947 18
3476947
0
CTGTGCATGGGAGCCAGATCT
|||||||||||||||||||||
CTGTGCATGGGAGCCAGATCT


gnl|BL_ORD_ID|10 3476947 <unknown description>
3476947
1
CTGTGCATGGGAGCCAGATCT
|||||||||||||||||||||
CTGTGCATGGGAGCCAGATCT


gnl|BL_ORD_ID|17 7562871 <unknown description>
7562871
0
GTTGCAGGTGTGGGAGAGCC
||||||||||||||||||||
GTTGCAGGTGTGGGAGAGCCG


gnl|BL_ORD_ID|312 3476947 18
3476947
0
TGCTCCTTGTCTTTCCCTCC
||||||||||||||||||||
TGCTCCTTGTCTTTCCCTCCG


gnl|BL_ORD_ID|10 3476947 <unknown description>
3476947
1
TGCTCCTTGTCTTTCCCTCC
||||||||||||||||||||
TGCTCCTTGTCTTTCCCTCCG


gnl|BL_ORD_ID|277 4995367 <unknown description>
4995367
0
TTAACGACGGCAGTAGCTTCC
|||||||||||||||||||||
TTAACGACGGCAGTAGCTTCCG


gnl|BL_ORD_ID|13 5085511 <unknown description>
5085511
0
GCAGGGTCTCCTCTTCAATT
||||||||||||||||||||
GCAGGGTCTCCTCTTCAATT


gnl|BL_ORD_ID|3 131171 <unknown description>
131171
0
CTGTGTCACGGTCTTGTCTCC
|||||||||||||||||||||
CTGTGTCACGGTCTTGTCTCC


gnl|BL_ORD_ID|277 4995367 <unknown description>
4995367
0
TAACGACGGCAGTAGCTTCC
||||||||||||||||||||
TAACGACGGCAGTAGCTTCCG


gnl|BL_ORD_ID|161 9981706 <unknown description>
9981706
0
CACTTGCCCTAATTAAATCC
||||||||||||||||||||
CACTTGCCCTAATTAAATCCG


gnl|BL_ORD_ID|176 1639848 <unknown description>
1639848
0
TTTTACCAAGGCATCAGGTGAG
||||||||||||||||||||||
TTTTACCAAGGCATCAGGTGAG


gnl|BL_ORD_ID|81 986031 <unknown description>
986031
0
GTTCAGAAAGCAACAACCCC
||||||||||||||||||||
GTTCAGAAAGCAACAACCCC


gnl|BL_ORD_ID|192 895665 <unknown description>
895665
0
CAGAAGTCACAAGTGAACCC
||||||||||||||||||||
CAGAAGTCACAAGTGAACCC


gnl|BL_ORD_ID|54 6229761 <unknown description>
6229761
0
GCGTTGGCGAGTCTCTGATTA
|||||||||||||||||||||
GCGTTGGCGAGTCTCTGATTA


gnl|BL_ORD_ID|12 5570943 <unknown description>
5570943
0
AGGACAACAGGGGGAGCCCC
||||||||||||||||||||
AGGACAACAGGGGGAGCCCC


gnl|BL_ORD_ID|171 3385262 <unknown description>
3385262
0
GCCTGTATGTGAACATCAGC
||||||||||||||||||||
GCCTGTATGTGAACATCAGC


gnl|BL_ORD_ID|81 986031 <unknown description>
986031
0
ACCATGCTACTTGAGGGCATT
|||||||||||||||||||||
ACCATGCTACTTGAGGGCATTG


gnl|BL_ORD_ID|139 4286355 <unknown description>
4286355
0
AGAAGGTGGTATGCTGATGGG
|||||||||||||||||||||
AGAAGGTGGTATGCTGATGGG



In [16]:
# Number of (target, hit location, match) entries collected in mappable.
len(mappable)


Out[16]:
42

In [17]:
# Look-up table: amplicon record id -> amplicon sequence.
ampsdict = {amp.id: amp.seq for amp in allamps}

Problem: not all BLAST hit titles (sequences) seem to map to amps300 dict entries... This was solved by changing the amps300 list that is being used. (But the individual amps are not the short versions you'd want...). Perhaps this should be revisited...


In [18]:
def printloc(queryseq, ampsdict):
    '''
    Draw a text map of the amplicon that a target hit, with the hit highlighted.

    Arguments:
      queryseq -- one entry of ``mappable``: ``(tgt, (amp_id, hit_start, hit_end), match)``.
                  ``len(tgt.seq)`` sets the feature length; ``hit_start > hit_end``
                  is treated as a reverse-strand hit.
      ampsdict -- dictionary mapping amplicon id to amplicon sequence; ``amp_id``
                  must be one of its keys (a KeyError propagates otherwise).

    Returns a 7-tuple ``(fullstring, context, subjectseq, fwdlocs, start, end, feature)``:
      fullstring -- the map as one string: for every 100 positions of the amplicon
                    a sequence row, a ruler row with chevrons over the feature,
                    and a feature-label row
      context    -- the amplicon SeqRecord sliced from 10 positions before the
                    feature start (clamped at 0) to 10 after the feature end
      subjectseq -- the amplicon as a SeqRecord carrying the new feature
      fwdlocs    -- ``[hit_start]`` for a forward hit, ``[]`` for a reverse hit
      start, end, feature -- the positions and the SeqFeature built for the hit
    '''
    subjectseq = SeqRecord(ampsdict[queryseq[1][0]])
    locstart = queryseq[1][1]
    locend = queryseq[1][2]
    tgtlen = len(queryseq[0].seq)
    seqlen = len(subjectseq.seq)

    # Figure out which strand the BLAST hit is on: start <= end is forward,
    # otherwise reverse (anchored on the lower coordinate, locend).
    fwdlocs = []
    revlocs = []
    if locstart <= locend:
        fwdlocs.append(locstart)
    else:
        revlocs.append(locend)

    # NOTE(review): the +1 / -2 / -1 offsets below are kept exactly as they were;
    # confirm them against BLAST's coordinate convention before relying on the
    # exact feature bounds.
    for item in fwdlocs:
        start = ExactPosition(int(item))
        end = ExactPosition(int(item + tgtlen + 1))
        location = FeatureLocation(start, end)
        feature = SeqFeature(location, type="cutsite_fwd", strand=+1)
        subjectseq.features.append(feature)

    for item in revlocs:
        start = ExactPosition(int(item) - 2)
        end = ExactPosition(start + tgtlen - 1)
        location = FeatureLocation(start, end)
        feature = SeqFeature(location, type="cutsite_rev", strand=-1)
        subjectseq.features.append(feature)

    # Ruler row: nine dashes and a caret per 10 positions, with chevrons
    # overwritten where the feature lies.
    mask = list((("-" * 9) + "^") * int(round(seqlen / 10.0)))
    for feature in subjectseq.features:
        featstart = int(feature.location.start)
        featend = int(feature.location.end)
        # Clamp the context start at 0: a negative slice start would wrap around
        # to the end of the amplicon for hits within 10 positions of its start.
        ctxstart = max(0, featstart - 10)
        if feature.strand == 1:
            mask = mask[:featstart] + [">"] * int(featend - 1 - featstart) + mask[featend - 1:]
            context = subjectseq[ctxstart:featend + 10]
        if feature.strand == -1:
            # NOTE(review): this inserts one more "<" than the span it replaces,
            # so the ruler grows by one character -- confirm this is intended.
            mask = mask[:featstart + 1] + ["<"] * int(featend + 1 - featstart) + mask[featend + 1:]
            context = subjectseq[ctxstart:featend + 10]
    mask = "".join(mask)

    # Label row: the feature type written under the start of the feature,
    # padded with spaces up to the feature's end.
    masklab = list(" " * seqlen)
    for feature in subjectseq.features:
        featstart = int(feature.location.start)
        featend = int(feature.location.end)
        featname = str(feature.type)
        masklab = (masklab[:featstart] + list(featname)
                   + list(" " * (featend - 1 - featstart - len(featname)))
                   + masklab[featend - 1:])
    masklab = "".join(masklab)

    # Draw out the map in chunks of 100 positions, three rows per chunk:
    # subject seq, the marker/counter row with chevrons over features, then the
    # feature label. Stepping with range() avoids the empty trailing chunk that
    # was emitted when the amplicon length was an exact multiple of 100, and
    # does not depend on Python 2 integer division.
    fullstring = []
    for indexstart in range(0, seqlen, 100):
        indexend = min(indexstart + 100, seqlen)
        left = str(indexstart + 1) + "  "
        right = "   " + str(indexend) + "\n"
        fullstring.append(left + str(subjectseq.seq[indexstart:indexend]) + right)
        fullstring.append(left + mask[indexstart:indexend] + right)
        fullstring.append(left + masklab[indexstart:indexend] + right)
    fullstring = "".join(fullstring)
    return (fullstring, context, subjectseq, fwdlocs, start, end, feature)

In [19]:
# Map entry 6 of mappable onto its amplicon. printloc takes the whole ampsdict
# and looks the amplicon up itself; the commented-out call below passed a
# single amplicon sequence instead.
#t = printloc(mappable[3], ampsdict[mappable[3][1][0]])
t = printloc(mappable[6], ampsdict)

In [20]:
# Element 1 of printloc's return tuple is the sequence context around the hit.
t[1]


Out[20]:
SeqRecord(seq=Seq('AAATTTAACCAGGGGTCCCTGTAGTGGCCATCTCCCCTC', SingleLetterAlphabet()), id='<unknown id>', name='<unknown name>', description='<unknown description>', dbxrefs=[])

In [21]:
# Re-check the number of entries in mappable.
len(mappable)


Out[21]:
42

Next up: trim the sequence files properly, based on each target being followed by the hairpin start, and include a map of the expected cut sites in that amplicon.


In [22]:
# Inspect one entry: (target SeqRecord, (amplicon id, subject start, subject end), match string).
mappable[1]


Out[22]:
(SeqRecord(seq=Seq('GAGCTGGCATTGGCCTGTCC', IUPACAmbiguousDNA()), id='02-5-1', name='02-5-1_B01_013', description='', dbxrefs=[]),
 (u'6229761', 1275, 1256),
 u'|||||||||||| |||||||')

In [23]:
# List the mappable entries whose target sequence contains an ambiguous base (N),
# together with their index.
for number, entry in enumerate(mappable):
    if "N" in entry[0]:
        print(number)
        print(entry)


2
(SeqRecord(seq=Seq('NCATGCATTAAAGCCATTGCC', IUPACAmbiguousDNA()), id='04-5-3', name='04-5-3_D01_009', description='', dbxrefs=[]), (u'3385262', 1093, 1074), u'||||||||||||||||||||')
19
(SeqRecord(seq=Seq('NANGAGGCACTTTGAAGAGCA', IUPACAmbiguousDNA()), id='36-5-19', name='36-5-19_D05_041', description='', dbxrefs=[]), (u'8426516', 1695, 1712), u'||||||||||||||||||')
20
(SeqRecord(seq=Seq('NANGAGGCACTTTGAAGAGCA', IUPACAmbiguousDNA()), id='36-5-19', name='36-5-19_D05_041', description='', dbxrefs=[]), (u'8424649', 3562, 3579), u'||||||||||||||||||')

In [24]:
# Inspect entry 7 of mappable.
mappable[7]


Out[24]:
(SeqRecord(seq=Seq('CTGTGCATGGGAGCCAGATCTG', IUPACAmbiguousDNA()), id='<unknown id>', name='18-5-9_B03_029', description='<unknown description>', dbxrefs=[]),
 (u'3476947', 1427, 1448),
 u'||||||||||||||||||||||')

In [25]:
# Map entry 10 of mappable onto its amplicon (the amplicon is looked up in
# ampsdict by printloc; the commented-out call passed a single sequence instead).
t = printloc(mappable[10], ampsdict)
#t = printloc(mappable[10], ampsdict["9981706"])

In [26]:
# Print the sequence of the context record (element 1 of the printloc result).
print(t[1].seq)


ATTTATTCATAGGAGGCACTTTGAAGAGCAGACATCTGCAGTA

In [27]:
# Inspect the last entry of `mappable`.
mappable[-1]


Out[27]:
(SeqRecord(seq=Seq('AGAAGGTGGTATGCTGATGGG', IUPACAmbiguousDNA()), id='90-5-45', name='90-5-45_B12_094', description='', dbxrefs=[]),
 (u'4286355', 1662, 1682),
 u'|||||||||||||||||||||')

In [28]:
primers_fwd = '''\
Fwd
TCCCTTTCTTTCCCGTTACC
AGAATAGGGACCGCATTGAC
GAGAGTCGGCGAGTCCATAA
AGAGCTGAAGCACCACAGGT
GCCGGATTCACTCAGATCA
ACCAGCCAACCAGACCATTA
GCCCGCTCCATTATAACAAG
TTCGTGCAATGGACAAGTAGA
GTTGACCCAAATGACCCATC
TGGTGAAGTGTGACGTAGCC
GGAATTGACATGGACAATGG
GGCTAAACCGGAATGAACAA
TGGGTCCCAATGATGAGTCT
CAGAGCTGCCTCATGACACT
GCAATGTGATGCGAAGTGTA
TAATCAGAGACTCGCCAACG
AAAGGATTGTGGACTCATGG
TGGACGCAGTTGTCATGATT
CGAAGAGGGATGCAGGACTA
AAGCGCCCTTCTTTCCTAAT
TAACAAGCCATTTGCCACCT
TTTCTGGAAGTTTGGACACTG
TGACTCAGAGGGTAGCAACT
ACCAAGAGTCCTCATGCACT
ACAATTCGCCAATCATTGCT
TCACACTTAACTGGGAGAATG
ATCATCCTGCGCCTAAGGTT
GCACTCTACACAAAGTTCTCG
CTGGGCCCAAAGTATCTCAT
TGTCACCCACTAATGTTTCAGG
CAAAGCTCACGTCAAATAAACG
GAACCAGAATGAGTGCTGTCC
AAAGACGGCCAGTATGCAGT
CAGTGTTCATCGGAACAAAGC
AGATCTTGGAGGCCCTGTTT
TGTGATTATGCAGAGGACAACC
TGCTCAATTACGGGTTTGGT
TTGGCCATACTTCAGCCAAT
CCGACCTGAACCCTCCTAAT
CTGTCTGTCTCTACCAATCACC
TGTGCTCTGTTGATGCGTCT
TCCTTCTCAGTATGCGCTGA
TGCAAGAGCGTCTGAATTTG
TATATTGCCTGGGCGCTAAC
TGTCACAACCCACTGATTCC
CCACTGATATAGTGTGGGCTAA
GTTACTGCCGTGAGGGATGA
AGTGATGGGTCTGCCAGAAT
AAACATGGTAAGCATCTGTGG
CAGTTATGGCTGCCTCGAA
GGGATTAGGGAGGATCAGGA
GCCAGGAATTGGCAGTAGTC
GACACGGGAAAGAAACATGA
TTTCAGTAGCCGCATCAGTG
CCAATTAAGCAGATTGGAGTTC
CCTTGTAATCCTACTGTGCCTA
GGCTTGCTCTGAGAAGGCTAT
TGCTGGAGTCCACCTGATTA
GACTGAACCGTCATTCCGATA
CGCCCACCAACTGAACTTAG
AGTGTGACGTCAGAGGCAAG
TTGCATTATTATGCGCTACTGG
GTTGTGAAATCTATTGCCTCCA
AGACACAATCTAATGAGGGATG
TTGGATGAGGTTGAGGCTTA
AGACTCCTGAGAGCCCATTT
CGTGCGATTGTTTCAGGTTT
GGAACGGTGTGTATGTCCAA
CCAAACCTAGGTGGTTCTCG
GGAACACTCATTAGGGAGCA
TCTTTACAGCACCTGCTTCTGA
GAGCCGAATAAAGTGACAAA
TGTGAATCAATCTGTCTTACGC
TATGATTGAGGGCCTTGTGG
CCAGTTCCAGGTGTGCCTA
CAGTGCCCACAAGGAGTAGG
AGAATAGGTGGATTCACTGAGG
AGTTGGGCAGGCCTAACATT
CCAATGGGCAGGAACTTATG
TCATCAACAACTGGAGTCTGC
ACGATGCAGCAATTCCCTAC
GGTACTGCCATCACCCTTGT
GCAGTGTGAGCCCAACAGTA
AGCCTGGACCTCTCCTTGAT
CCCATAAGTGCCGACTTCA
CCAGAAAGTAGGAGCCGATG
TCCCGGCTCTAAAGTAGTCTTG
AAAGTCAAGGGCTGCCATC
GGGAGAGCCCTTGGAATAAA
AACGATGTACAACACCAGTTGC
GCCAAGGATGAAACCAAATC
AATGGATCAATACCCTGTCC
CCACATAGCTTCCCTGTTCTTT
TGGTCATACCACACCAATGAA
CCAAGCTAGGCTTGAACTGG
TAGCCGCTTCGCAGTTTAAT
CCACCCTTCAGACTGGCTAC
AGGAAGGACATGGAATTAACTG
AATGCCCTCAAGTAGCATGG
GTCTTGAGGAAGCAGCAACC
TTTGCCCGGTGATAGAATGT
TCATGAGTTGCATATGGATGG
GTTCATTGATGGGTGCCAGT
TGGTGAACCTGTATCAAATACG
TCAGGAATGCCTTACTTGAGA
CTTGCAGGAACTTATGAACACA
TGAATGGATCCACCACAGAA
ATCCCAAGGGAACACGTAAG
GCCCACAGATTGCATTCAC
CGGCCCTGTCTCACAGTAA
CCAGGGTATTCTAACCCTATGC
TGGAGAATCCCAAGGATGTT
AACGTGCAACCTTTGAGTCC
TCCTCCTAAAGAAACGACGTG
TCCAAGCACTCCAACCTTGT
TTTCTGATGGGCCTCTGG
TCCTCGTAAGAGGTGTTTCCA
CACCCAACTCTTATGGTGGAA
ACCCGCCTCAATACCAAAGT
TCAGAATGGCTATGGCTGTG
CTAGCGGTTTATGAGCGTCAC
CCAACTCACACTCCAATAATCA
TCCATTGTAGCCGTGCTGTA
GCATTGCAGTTCCAATCAGA
GGCTGGACAAATACCACTGC
TCGTCAGAAGTTGTCCAAGG
CCACTATGGCCAACAAGAGAG
CCTGTGGGAAGTTATGAGACG
CTATTTGACCCGCAGTTTCC
TGGTTGCTCACATCACTGAA
CTCTTTGCAGATGAGCGTGA
GGAAGCTACTGCCGTCGTTA
TCCTTTATTGTCCCGCCATA
CCACATGTGCCTTGGTAAGA
CAACAGCAATCACCCTTCAA
AGGAGGATTATTGCACCCATA
TGTGCATCACACACTCTGGA
CTGGGACCACAGGGATAAAG
GTCGCACACATAAACGCAGT
AACAGGATCGGAGAGCATTG
ACACTCATGATAGTGACCTGCT
GGGAACCGTAGAGTTTATTGTG
AGGTTCCACAAGGAGGGAGT
ACATTGGCCTTGATCCTGAG
'''

In [29]:
primers_rev = '''\
Rev
GCGCCTAAGTGTCTTTGCAT
CAAGTGCAGAGCACCTTGAC
TGCCAACGTTTGTCTCTGAC
ATCACATGTGTCTCCAGGAA
GGCAGTTGGGACGTATTTGT
GCCAGTACCTGCCAGTAACC
TTGCTGGCACATTACCACTC
TTGGCTCCTGGACTGTCTTC
GAAACAGCCGTGTCCAGAT
TTCTGCAACGAACTGTCTCTG
GTTTCGGACCCACAATGG
TGTCACTAAAGCCTAGCAGAAA
CCCAGGAGATGGTCATAATC
GCCCTGAGTATCGGCATACA
TTTGTCAGCTTGTGGACCTG
AGGAATCCTATGCTATTTCTCG
CCCATGGTCCTTACAGACTGA
GAGCGTAGCACCACTTACGG
AACTGGTATGAATGCGCAAC
CCTGCTATCTCATCTTCCTTCA
CGAAATCCGGAAATCTCTGT
TCGGTCAGAATCACATCTGC
GCACTGGGATCTCAGGTTTG
CATTCTTCACGCTTGTTCCA
GCTGAAAGATACCTGCCAACA
GGGCAGGCTCTCTTAGTCAA
GCTGCTTGAATAATTCGTCTGC
TGACCGGAAATGTTGGAAAC
CTTGCACAAGTTGCTTCACA
GCTGTACCCGTGTAGGCTTT
GGCAAAGGGCTCCAGATATAA
CCCAGGAATAGAAGTCACGTTT
CTAATTGAATGCGTTCATGC
TAGGATGCTGCCCTATGGTC
GTCACCGACCATTCATTTCA
TTTCATCCCTTGTCATCCAT
GGGCTCTGGTCAAATGAT
CTTCTGCTCATGGGTTTGGT
CCAGTCTAGTGGCCAGGATT
AATTGAAGAGGAGACCCTGCT
AGGGCATCTCCAATGGTGTA
CTGCTGTACATCCAGGCTGA
CCATTACGGATGTAGTTCAGCA
GCAGTCGAGGCTTTGAGTCT
GCAAACCTTCAGGAGCATGT
AAAGACCCAGCAGGAATTGA
GGGTGTCTTAGAGGGTAACAAA
CACTCCATACAAAGCGCTCA
AATGGAAATCGCCACTATACG
ACAGACCCGCCCTGATGAT
CATTGACATGACACATTTCTCG
AGCTCTGTCCCAGGGTATAGT
AAGCCGTAAAGTGGAAGCAG
TGAGCTAACATTCTCAAGTCCA
TAATTGGGCTGTCCTTCACC
CCAGCTCAAGTTCGAGGAAA
ACATTCGCCGTAAAGCAAAG
AAATCCATTGGGCCTGCT
AAGGGACCATCTGGGTATGT
TGACCTGTACAACACCTTGTGA
TGTGCTACTGCCATGTACCC
CCATCTTAGGCCAACTTCCA
ACCCATCCTGGCACACTGTA
ACCCAAGGGTCTCACACTTC
GGGATTGAGTCAGGTGGGTTA
ATCCAAGTCCTGCCTGAGGT
TTTCATGTGAGGTTGCCAAT
AACACTTGTGTATCGGCCATC
TGAGCAACTTATTGAGGCACA
AGGGTTAGACGACTGCCAAG
TGCATTTAGACGTTTGGTTG
TCGAACTATCATCCCGCAGT
TCACCTGATGCCTTGGTAAA
CCCTCCAAATGAAGTGACCT
CCACCCAGGATCTATTTAGAGG
TTCGGCATCGCTTATTTACG
CTTTACGGATTGGGCAAGAA
TGACCCACTCAGCATAATGAA
TTTCTGGCAAGCACTCAGAA
AAACAAGGACATGCCACACA
TTCAATCCAAACGATGCAGA
TGCACCAGTCTATTCGGTCA
CCTGCATGCCTAGGGTATATT
CCCTGTGGTTGTCTAGCGTA
CCTAAGGCGCAATAGTGTGG
CTATCCAGAACCTCCCAGCA
AGAATACCACTGCTTGCTGAGA
CCTCTGCTGGCTACAGTTTG
CCATAAACCTTGGACGCAAC
TTTGAGTTGCCTGAACGTGA
GGTCTTCTTGGCCTTCCTAAA
GGTAGATACCCGTGGAATGC
GGGAGGGTATCCACATGAGA
GGAAGTGTAAGCTAAGGCTCA
ATTTCACGGCAAGCCAATTA
TTTGTCGCGCATCACTTT
CATTCCCTAAGGCATTTGTTTC
CCCTTAGAGGACAACGGAGA
TCGAGCATGGTCTGCATTAG
ACCTCTGTTGGTCCCTATGC
GGATTACAGTGGCCATATCGTT
TTAAGGAGCTGATGATTCCAG
CTACATGCCTTGGGCTTAGG
CTCAGGGTTCCTGTGCTCTC
CCCTCTTAGGGTATACGGGTTA
AATTTGGGTCGTGCGTATGT
ACTCCACTGAGGCCCAGATA
TTGGAAGGGCCATGTATAGG
AAGGGACGGTTTAGGGTCAG
CACGTGAGCTTCGGATGTTA
TTGAATGCATAGCACCTTTG
TGGCCTTGATCCTTCAGTTC
ATTTCAAATGCCCAAACGAC
TAACACCATGGCCGAGATTT
AAGTTCTCCAGGCGAATCAG
AGGTGTTTACCGAAGGCAGA
CGAGCTGTTGGTCATTGCTA
TCCAACCATTCCAAAGTCAA
CATCAGCACAAGCAGTCGTT
TGGTCTGATGTGACGAAAGC
TCTATCCATGGAGTCATTTGG
CTGGATGGCCAACTTCTGTC
GGAGACAAGACCGTGACACA
AACCTTGGCCAGGTATTATG
CCACATTTGTAAACGGCTCA
CCCATATTTGCGACATGTGTT
CCCATCAGCATACCACCTTC
CCCAGATTCCTGCCCATT
GATTTCGGGTGCATTGTCTT
GGACAACAGCTATGGCTTGC
TGTGTGTTATGGCGATGTCC
TTGCCTATAATTGAGCCAGAGA
AAACCATAGATCCTGGTTCAG
TTCAGATACTTCATCCTCAACC
ATGTATATTCACGCCTGTGG
GCTGCACAGAGATTCGATGA
CATGTCCAGGCAGTCCAAT
GGCAGGGTCCATCTACAGTT
CCCTCTCTCGGCTCCTATCT
TGCTCTTCAAAGTGCCTCCT
GGCCATCTGAGACTTTGCAC
CTGCAGCAATGGCCTTAAAT
AACTTGAGCGCAGGGAACT
AGATCTGGCTCCCATGCAC
'''

In [30]:
import io

# Parse each newline-delimited primer string into a one-column DataFrame; the
# first line ("Fwd" / "Rev") is consumed as the column header.
# NOTE: this rebinds primers_fwd / primers_rev from str to DataFrame, so the cell
# can only be re-run after re-running the cells that define the raw strings.
primers_fwd = pandas.read_csv(io.BytesIO(primers_fwd), sep="\t")
primers_rev = pandas.read_csv(io.BytesIO(primers_rev), sep="\t")

In [31]:
# Flatten each one-column primer DataFrame into a plain list of sequence strings.
# itertuples() yields (index, value) pairs; only the value is kept.
fwdprimerlist = [primer for idx, primer in primers_fwd.itertuples()]

revprimerlist = [primer for idx, primer in primers_rev.itertuples()]

In [32]:
# Reverse-complement every reverse primer so that it reads in the same
# orientation as the forward primers.
revprimersreversed = [
    Seq(primer_text, IUPACAmbiguousDNA()).reverse_complement()
    for primer_text in revprimerlist
]
# Bind the intended name. The original assigned only to the misspelled
# `revprimerslist`, which left `revprimerlist` holding the raw strings.
revprimerlist = revprimersreversed
# Misspelled alias kept so that any code referring to it still works.
revprimerslist = revprimersreversed

In [33]:
# Make revprimerlist an independent deep copy of the reverse-complemented primer Seqs.
revprimerlist = copy.deepcopy(revprimersreversed)

In [34]:
# Sanity check: number of reverse-complemented reverse primers.
len(revprimersreversed)


Out[34]:
144

In [35]:
# Wrap every forward primer string in a Seq with the ambiguous-DNA alphabet,
# then rebind fwdprimerlist to an independent deep copy of that list.
f = [Seq(primer_text, IUPACAmbiguousDNA()) for primer_text in fwdprimerlist]
fwdprimerlist = copy.deepcopy(f)

In [36]:
# Display the forward primers, now Seq objects.
fwdprimerlist


Out[36]:
[Seq('TCCCTTTCTTTCCCGTTACC', IUPACAmbiguousDNA()),
 Seq('AGAATAGGGACCGCATTGAC', IUPACAmbiguousDNA()),
 Seq('GAGAGTCGGCGAGTCCATAA', IUPACAmbiguousDNA()),
 Seq('AGAGCTGAAGCACCACAGGT', IUPACAmbiguousDNA()),
 Seq('GCCGGATTCACTCAGATCA', IUPACAmbiguousDNA()),
 Seq('ACCAGCCAACCAGACCATTA', IUPACAmbiguousDNA()),
 Seq('GCCCGCTCCATTATAACAAG', IUPACAmbiguousDNA()),
 Seq('TTCGTGCAATGGACAAGTAGA', IUPACAmbiguousDNA()),
 Seq('GTTGACCCAAATGACCCATC', IUPACAmbiguousDNA()),
 Seq('TGGTGAAGTGTGACGTAGCC', IUPACAmbiguousDNA()),
 Seq('GGAATTGACATGGACAATGG', IUPACAmbiguousDNA()),
 Seq('GGCTAAACCGGAATGAACAA', IUPACAmbiguousDNA()),
 Seq('TGGGTCCCAATGATGAGTCT', IUPACAmbiguousDNA()),
 Seq('CAGAGCTGCCTCATGACACT', IUPACAmbiguousDNA()),
 Seq('GCAATGTGATGCGAAGTGTA', IUPACAmbiguousDNA()),
 Seq('TAATCAGAGACTCGCCAACG', IUPACAmbiguousDNA()),
 Seq('AAAGGATTGTGGACTCATGG', IUPACAmbiguousDNA()),
 Seq('TGGACGCAGTTGTCATGATT', IUPACAmbiguousDNA()),
 Seq('CGAAGAGGGATGCAGGACTA', IUPACAmbiguousDNA()),
 Seq('AAGCGCCCTTCTTTCCTAAT', IUPACAmbiguousDNA()),
 Seq('TAACAAGCCATTTGCCACCT', IUPACAmbiguousDNA()),
 Seq('TTTCTGGAAGTTTGGACACTG', IUPACAmbiguousDNA()),
 Seq('TGACTCAGAGGGTAGCAACT', IUPACAmbiguousDNA()),
 Seq('ACCAAGAGTCCTCATGCACT', IUPACAmbiguousDNA()),
 Seq('ACAATTCGCCAATCATTGCT', IUPACAmbiguousDNA()),
 Seq('TCACACTTAACTGGGAGAATG', IUPACAmbiguousDNA()),
 Seq('ATCATCCTGCGCCTAAGGTT', IUPACAmbiguousDNA()),
 Seq('GCACTCTACACAAAGTTCTCG', IUPACAmbiguousDNA()),
 Seq('CTGGGCCCAAAGTATCTCAT', IUPACAmbiguousDNA()),
 Seq('TGTCACCCACTAATGTTTCAGG', IUPACAmbiguousDNA()),
 Seq('CAAAGCTCACGTCAAATAAACG', IUPACAmbiguousDNA()),
 Seq('GAACCAGAATGAGTGCTGTCC', IUPACAmbiguousDNA()),
 Seq('AAAGACGGCCAGTATGCAGT', IUPACAmbiguousDNA()),
 Seq('CAGTGTTCATCGGAACAAAGC', IUPACAmbiguousDNA()),
 Seq('AGATCTTGGAGGCCCTGTTT', IUPACAmbiguousDNA()),
 Seq('TGTGATTATGCAGAGGACAACC', IUPACAmbiguousDNA()),
 Seq('TGCTCAATTACGGGTTTGGT', IUPACAmbiguousDNA()),
 Seq('TTGGCCATACTTCAGCCAAT', IUPACAmbiguousDNA()),
 Seq('CCGACCTGAACCCTCCTAAT', IUPACAmbiguousDNA()),
 Seq('CTGTCTGTCTCTACCAATCACC', IUPACAmbiguousDNA()),
 Seq('TGTGCTCTGTTGATGCGTCT', IUPACAmbiguousDNA()),
 Seq('TCCTTCTCAGTATGCGCTGA', IUPACAmbiguousDNA()),
 Seq('TGCAAGAGCGTCTGAATTTG', IUPACAmbiguousDNA()),
 Seq('TATATTGCCTGGGCGCTAAC', IUPACAmbiguousDNA()),
 Seq('TGTCACAACCCACTGATTCC', IUPACAmbiguousDNA()),
 Seq('CCACTGATATAGTGTGGGCTAA', IUPACAmbiguousDNA()),
 Seq('GTTACTGCCGTGAGGGATGA', IUPACAmbiguousDNA()),
 Seq('AGTGATGGGTCTGCCAGAAT', IUPACAmbiguousDNA()),
 Seq('AAACATGGTAAGCATCTGTGG', IUPACAmbiguousDNA()),
 Seq('CAGTTATGGCTGCCTCGAA', IUPACAmbiguousDNA()),
 Seq('GGGATTAGGGAGGATCAGGA', IUPACAmbiguousDNA()),
 Seq('GCCAGGAATTGGCAGTAGTC', IUPACAmbiguousDNA()),
 Seq('GACACGGGAAAGAAACATGA', IUPACAmbiguousDNA()),
 Seq('TTTCAGTAGCCGCATCAGTG', IUPACAmbiguousDNA()),
 Seq('CCAATTAAGCAGATTGGAGTTC', IUPACAmbiguousDNA()),
 Seq('CCTTGTAATCCTACTGTGCCTA', IUPACAmbiguousDNA()),
 Seq('GGCTTGCTCTGAGAAGGCTAT', IUPACAmbiguousDNA()),
 Seq('TGCTGGAGTCCACCTGATTA', IUPACAmbiguousDNA()),
 Seq('GACTGAACCGTCATTCCGATA', IUPACAmbiguousDNA()),
 Seq('CGCCCACCAACTGAACTTAG', IUPACAmbiguousDNA()),
 Seq('AGTGTGACGTCAGAGGCAAG', IUPACAmbiguousDNA()),
 Seq('TTGCATTATTATGCGCTACTGG', IUPACAmbiguousDNA()),
 Seq('GTTGTGAAATCTATTGCCTCCA', IUPACAmbiguousDNA()),
 Seq('AGACACAATCTAATGAGGGATG', IUPACAmbiguousDNA()),
 Seq('TTGGATGAGGTTGAGGCTTA', IUPACAmbiguousDNA()),
 Seq('AGACTCCTGAGAGCCCATTT', IUPACAmbiguousDNA()),
 Seq('CGTGCGATTGTTTCAGGTTT', IUPACAmbiguousDNA()),
 Seq('GGAACGGTGTGTATGTCCAA', IUPACAmbiguousDNA()),
 Seq('CCAAACCTAGGTGGTTCTCG', IUPACAmbiguousDNA()),
 Seq('GGAACACTCATTAGGGAGCA', IUPACAmbiguousDNA()),
 Seq('TCTTTACAGCACCTGCTTCTGA', IUPACAmbiguousDNA()),
 Seq('GAGCCGAATAAAGTGACAAA', IUPACAmbiguousDNA()),
 Seq('TGTGAATCAATCTGTCTTACGC', IUPACAmbiguousDNA()),
 Seq('TATGATTGAGGGCCTTGTGG', IUPACAmbiguousDNA()),
 Seq('CCAGTTCCAGGTGTGCCTA', IUPACAmbiguousDNA()),
 Seq('CAGTGCCCACAAGGAGTAGG', IUPACAmbiguousDNA()),
 Seq('AGAATAGGTGGATTCACTGAGG', IUPACAmbiguousDNA()),
 Seq('AGTTGGGCAGGCCTAACATT', IUPACAmbiguousDNA()),
 Seq('CCAATGGGCAGGAACTTATG', IUPACAmbiguousDNA()),
 Seq('TCATCAACAACTGGAGTCTGC', IUPACAmbiguousDNA()),
 Seq('ACGATGCAGCAATTCCCTAC', IUPACAmbiguousDNA()),
 Seq('GGTACTGCCATCACCCTTGT', IUPACAmbiguousDNA()),
 Seq('GCAGTGTGAGCCCAACAGTA', IUPACAmbiguousDNA()),
 Seq('AGCCTGGACCTCTCCTTGAT', IUPACAmbiguousDNA()),
 Seq('CCCATAAGTGCCGACTTCA', IUPACAmbiguousDNA()),
 Seq('CCAGAAAGTAGGAGCCGATG', IUPACAmbiguousDNA()),
 Seq('TCCCGGCTCTAAAGTAGTCTTG', IUPACAmbiguousDNA()),
 Seq('AAAGTCAAGGGCTGCCATC', IUPACAmbiguousDNA()),
 Seq('GGGAGAGCCCTTGGAATAAA', IUPACAmbiguousDNA()),
 Seq('AACGATGTACAACACCAGTTGC', IUPACAmbiguousDNA()),
 Seq('GCCAAGGATGAAACCAAATC', IUPACAmbiguousDNA()),
 Seq('AATGGATCAATACCCTGTCC', IUPACAmbiguousDNA()),
 Seq('CCACATAGCTTCCCTGTTCTTT', IUPACAmbiguousDNA()),
 Seq('TGGTCATACCACACCAATGAA', IUPACAmbiguousDNA()),
 Seq('CCAAGCTAGGCTTGAACTGG', IUPACAmbiguousDNA()),
 Seq('TAGCCGCTTCGCAGTTTAAT', IUPACAmbiguousDNA()),
 Seq('CCACCCTTCAGACTGGCTAC', IUPACAmbiguousDNA()),
 Seq('AGGAAGGACATGGAATTAACTG', IUPACAmbiguousDNA()),
 Seq('AATGCCCTCAAGTAGCATGG', IUPACAmbiguousDNA()),
 Seq('GTCTTGAGGAAGCAGCAACC', IUPACAmbiguousDNA()),
 Seq('TTTGCCCGGTGATAGAATGT', IUPACAmbiguousDNA()),
 Seq('TCATGAGTTGCATATGGATGG', IUPACAmbiguousDNA()),
 Seq('GTTCATTGATGGGTGCCAGT', IUPACAmbiguousDNA()),
 Seq('TGGTGAACCTGTATCAAATACG', IUPACAmbiguousDNA()),
 Seq('TCAGGAATGCCTTACTTGAGA', IUPACAmbiguousDNA()),
 Seq('CTTGCAGGAACTTATGAACACA', IUPACAmbiguousDNA()),
 Seq('TGAATGGATCCACCACAGAA', IUPACAmbiguousDNA()),
 Seq('ATCCCAAGGGAACACGTAAG', IUPACAmbiguousDNA()),
 Seq('GCCCACAGATTGCATTCAC', IUPACAmbiguousDNA()),
 Seq('CGGCCCTGTCTCACAGTAA', IUPACAmbiguousDNA()),
 Seq('CCAGGGTATTCTAACCCTATGC', IUPACAmbiguousDNA()),
 Seq('TGGAGAATCCCAAGGATGTT', IUPACAmbiguousDNA()),
 Seq('AACGTGCAACCTTTGAGTCC', IUPACAmbiguousDNA()),
 Seq('TCCTCCTAAAGAAACGACGTG', IUPACAmbiguousDNA()),
 Seq('TCCAAGCACTCCAACCTTGT', IUPACAmbiguousDNA()),
 Seq('TTTCTGATGGGCCTCTGG', IUPACAmbiguousDNA()),
 Seq('TCCTCGTAAGAGGTGTTTCCA', IUPACAmbiguousDNA()),
 Seq('CACCCAACTCTTATGGTGGAA', IUPACAmbiguousDNA()),
 Seq('ACCCGCCTCAATACCAAAGT', IUPACAmbiguousDNA()),
 Seq('TCAGAATGGCTATGGCTGTG', IUPACAmbiguousDNA()),
 Seq('CTAGCGGTTTATGAGCGTCAC', IUPACAmbiguousDNA()),
 Seq('CCAACTCACACTCCAATAATCA', IUPACAmbiguousDNA()),
 Seq('TCCATTGTAGCCGTGCTGTA', IUPACAmbiguousDNA()),
 Seq('GCATTGCAGTTCCAATCAGA', IUPACAmbiguousDNA()),
 Seq('GGCTGGACAAATACCACTGC', IUPACAmbiguousDNA()),
 Seq('TCGTCAGAAGTTGTCCAAGG', IUPACAmbiguousDNA()),
 Seq('CCACTATGGCCAACAAGAGAG', IUPACAmbiguousDNA()),
 Seq('CCTGTGGGAAGTTATGAGACG', IUPACAmbiguousDNA()),
 Seq('CTATTTGACCCGCAGTTTCC', IUPACAmbiguousDNA()),
 Seq('TGGTTGCTCACATCACTGAA', IUPACAmbiguousDNA()),
 Seq('CTCTTTGCAGATGAGCGTGA', IUPACAmbiguousDNA()),
 Seq('GGAAGCTACTGCCGTCGTTA', IUPACAmbiguousDNA()),
 Seq('TCCTTTATTGTCCCGCCATA', IUPACAmbiguousDNA()),
 Seq('CCACATGTGCCTTGGTAAGA', IUPACAmbiguousDNA()),
 Seq('CAACAGCAATCACCCTTCAA', IUPACAmbiguousDNA()),
 Seq('AGGAGGATTATTGCACCCATA', IUPACAmbiguousDNA()),
 Seq('TGTGCATCACACACTCTGGA', IUPACAmbiguousDNA()),
 Seq('CTGGGACCACAGGGATAAAG', IUPACAmbiguousDNA()),
 Seq('GTCGCACACATAAACGCAGT', IUPACAmbiguousDNA()),
 Seq('AACAGGATCGGAGAGCATTG', IUPACAmbiguousDNA()),
 Seq('ACACTCATGATAGTGACCTGCT', IUPACAmbiguousDNA()),
 Seq('GGGAACCGTAGAGTTTATTGTG', IUPACAmbiguousDNA()),
 Seq('AGGTTCCACAAGGAGGGAGT', IUPACAmbiguousDNA()),
 Seq('ACATTGGCCTTGATCCTGAG', IUPACAmbiguousDNA())]

In [37]:
# Look up one sequence in ampsdict by its string key.
ampsdict["10206893"]


Out[37]:
Seq('GGAGAATTCAACCCATAATAAAAAAAAGGCTCCCCCCTACCCTGGGTAGACTCC...CGC', SingleLetterAlphabet())

In [38]:
# Append the reverse-complemented reverse primers to the forward primers, so
# fwdprimerlist now holds both sets (forward entries first).
# NOTE: this mutates the list in place; re-running the cell appends them again.
fwdprimerlist += revprimerlist

In [39]:
# Wrap each primer Seq in a SeqRecord whose id, name and description are all its
# position in the combined primer list, as a string.
f = []
for position, primer_seq in enumerate(fwdprimerlist):
    label = str(position)
    f.append(SeqRecord(primer_seq, id=label, name=label, description=label))

In [40]:
# Write the combined primer records to a FASTA file.
with open("288primers.fasta", "w") as fasta_out:
    SeqIO.write(f, fasta_out, "fasta")

Made a BLAST database (`288prim`, used as `db` below) from the 288 primers in `288primers.fasta`.


In [41]:
# Goal: find which mappable 20mers are really PCR fragment ends, by BLASTing
# them against the primer database.
# Step 1: collect the target SeqRecords for a query FASTA. They come from a deep
# copy of `mappable`, so edits to these records leave `mappable` untouched.
m = [record for record, hit, matchline in copy.deepcopy(mappable)]

In [42]:
# Use each record's name as its id before writing the query FASTA. The BLAST
# results are later matched back to `mappable` by record name.
for record in m:
    record.id = record.name

In [43]:
# Write the query records to FASTA; SeqIO.write returns the number of records written.
SeqIO.write(m, "mappablesequencedtgts.fasta", "fasta")


Out[43]:
42

In [44]:
# BLAST the mappable targets against the primer database using blastn-short.
# outfmt=5 writes XML to 288prim.blast.
blastn_cline = NcbiblastnCommandline(
    query="mappablesequencedtgts.fasta",
    db="288prim",
    task="blastn-short",
    outfmt=5,
    out="288prim.blast",
    max_target_seqs=100,
    num_threads=7,
    evalue=0.01,
)
# Calling the command-line object runs BLAST; timeit runs it once and returns
# the elapsed time in seconds.
timeit.timeit(blastn_cline, number=1)


Out[44]:
0.25040197372436523

In [45]:
# Parse the BLAST XML output into a list with one record per query sequence.
# The `with` block closes the file even if parsing raises. NCBIXML.parse is
# lazy, so the records must be collected while the file is still open.
with open("288prim.blast") as result_handle:
    prim_blast_records = NCBIXML.parse(result_handle)  # one record per query
    prim_blast_records_list = list(prim_blast_records)

In [46]:
# Dump every HSP of every primer-BLAST alignment and count them. Before each
# alignment's HSPs, print that alignment's index within its query's hit list.
counter = 0
for record in prim_blast_records_list:
    for rank, alignment in enumerate(record.alignments):
        print(rank)
        for hsp in alignment.hsps:
            print(hsp)
            counter += 1
print(counter)


0
Score 20 (40 bits), expectation 4.4e-08, alignment length 20
Query:       2 CCATGCTACTTGAGGGCATT 21
               ||||||||||||||||||||
Sbjct:      20 CCATGCTACTTGAGGGCATT 1
0
Score 20 (40 bits), expectation 4.1e-08, alignment length 20
Query:       2 TAATGGTCTGGTTGGCTGGT 21
               ||||||||||||||||||||
Sbjct:      20 TAATGGTCTGGTTGGCTGGT 1
0
Score 21 (42 bits), expectation 1.1e-08, alignment length 21
Query:       1 CCATCCATATGCAACTCATGA 21
               |||||||||||||||||||||
Sbjct:      21 CCATCCATATGCAACTCATGA 1
0
Score 19 (38 bits), expectation 1.6e-07, alignment length 19
Query:       3 GTGCATGGGAGCCAGATCT 21
               |||||||||||||||||||
Sbjct:       1 GTGCATGGGAGCCAGATCT 19
0
Score 19 (38 bits), expectation 1.6e-07, alignment length 19
Query:       3 GTGCATGGGAGCCAGATCT 21
               |||||||||||||||||||
Sbjct:       1 GTGCATGGGAGCCAGATCT 19
0
Score 20 (40 bits), expectation 4.4e-08, alignment length 20
Query:       1 TACAGCACGGCTACAATGGA 20
               ||||||||||||||||||||
Sbjct:      20 TACAGCACGGCTACAATGGA 1
0
Score 20 (40 bits), expectation 4.1e-08, alignment length 20
Query:       2 AGGAGGCACTTTGAAGAGCA 21
               ||||||||||||||||||||
Sbjct:       1 AGGAGGCACTTTGAAGAGCA 20
0
Score 20 (40 bits), expectation 4.1e-08, alignment length 20
Query:       2 AGGAGGCACTTTGAAGAGCA 21
               ||||||||||||||||||||
Sbjct:       1 AGGAGGCACTTTGAAGAGCA 20
0
Score 20 (40 bits), expectation 4.1e-08, alignment length 20
Query:       2 GGTGAAGGACAGCCCAATTA 21
               ||||||||||||||||||||
Sbjct:       1 GGTGAAGGACAGCCCAATTA 20
0
Score 20 (40 bits), expectation 4.1e-08, alignment length 20
Query:       2 TACAGCACGGCTACAATGGA 21
               ||||||||||||||||||||
Sbjct:      20 TACAGCACGGCTACAATGGA 1
0
Score 20 (40 bits), expectation 4.4e-08, alignment length 20
Query:       1 CCATGCTACTTGAGGGCATT 20
               ||||||||||||||||||||
Sbjct:      20 CCATGCTACTTGAGGGCATT 1
0
Score 18 (36 bits), expectation 6.9e-07, alignment length 18
Query:       4 GAGGCACTTTGAAGAGCA 21
               ||||||||||||||||||
Sbjct:       3 GAGGCACTTTGAAGAGCA 20
0
Score 18 (36 bits), expectation 6.9e-07, alignment length 18
Query:       4 GAGGCACTTTGAAGAGCA 21
               ||||||||||||||||||
Sbjct:       3 GAGGCACTTTGAAGAGCA 20
0
Score 20 (40 bits), expectation 4.4e-08, alignment length 20
Query:       2 TCTTACCAAGGCACATGTGG 21
               ||||||||||||||||||||
Sbjct:      20 TCTTACCAAGGCACATGTGG 1
0
Score 20 (40 bits), expectation 4.1e-08, alignment length 20
Query:       1 CTCAAGTAAGGCATTCCTGA 20
               ||||||||||||||||||||
Sbjct:      20 CTCAAGTAAGGCATTCCTGA 1
0
Score 19 (38 bits), expectation 1.8e-07, alignment length 19
Query:       3 GTGCATGGGAGCCAGATCT 21
               |||||||||||||||||||
Sbjct:       1 GTGCATGGGAGCCAGATCT 19
0
Score 19 (38 bits), expectation 1.8e-07, alignment length 19
Query:       3 GTGCATGGGAGCCAGATCT 21
               |||||||||||||||||||
Sbjct:       1 GTGCATGGGAGCCAGATCT 19
0
Score 20 (40 bits), expectation 4.1e-08, alignment length 20
Query:       2 TAACGACGGCAGTAGCTTCC 21
               ||||||||||||||||||||
Sbjct:      20 TAACGACGGCAGTAGCTTCC 1
0
Score 20 (40 bits), expectation 4.1e-08, alignment length 20
Query:       1 GCAGGGTCTCCTCTTCAATT 20
               ||||||||||||||||||||
Sbjct:       2 GCAGGGTCTCCTCTTCAATT 21
0
Score 20 (40 bits), expectation 4.4e-08, alignment length 20
Query:       2 TGTGTCACGGTCTTGTCTCC 21
               ||||||||||||||||||||
Sbjct:       1 TGTGTCACGGTCTTGTCTCC 20
0
Score 20 (40 bits), expectation 4.4e-08, alignment length 20
Query:       1 TAACGACGGCAGTAGCTTCC 20
               ||||||||||||||||||||
Sbjct:      20 TAACGACGGCAGTAGCTTCC 1
0
Score 20 (40 bits), expectation 4.1e-08, alignment length 20
Query:       2 TTTACCAAGGCATCAGGTGA 21
               ||||||||||||||||||||
Sbjct:       1 TTTACCAAGGCATCAGGTGA 20
1
Score 11 (22 bits), expectation 9.6e-03, alignment length 11
Query:       3 TTACCAAGGCA 13
               |||||||||||
Sbjct:      18 TTACCAAGGCA 8
0
Score 20 (40 bits), expectation 4.4e-08, alignment length 20
Query:       2 CGTTGGCGAGTCTCTGATTA 21
               ||||||||||||||||||||
Sbjct:      20 CGTTGGCGAGTCTCTGATTA 1
0
Score 20 (40 bits), expectation 4.1e-08, alignment length 20
Query:       2 CCATGCTACTTGAGGGCATT 21
               ||||||||||||||||||||
Sbjct:      20 CCATGCTACTTGAGGGCATT 1
0
Score 20 (40 bits), expectation 4.4e-08, alignment length 20
Query:       2 GAAGGTGGTATGCTGATGGG 21
               ||||||||||||||||||||
Sbjct:       1 GAAGGTGGTATGCTGATGGG 20
26

In [46]:


In [47]:
# Names of the mappable targets that had no hit against the primer library.
# Only the first whitespace-delimited token of the query title is kept.
mappable_noends = [
    record.query.split()[0]
    for record in prim_blast_records_list
    if not record.alignments
]

So, 25 of the 42 mappable matches are the ends of PCR products. Next, filter the mappable list to remove these. Then check PAM-adjacency.


In [48]:
# Number of targets with no primer-library hit.
len(mappable_noends)


Out[48]:
17

In [49]:
# Confirm that each `mappable` entry is a 3-tuple before unpacking it below.
len(mappable[0])


Out[49]:
3

In [50]:
# Keep only the mappable entries whose record name is in mappable_noends.
# Matching is by record name, so entries that share a name are kept or dropped together.
d = [
    (record, hit, matchline)
    for record, hit, matchline in mappable
    if record.name in mappable_noends
]

In [51]:
# Inspect the first entry of `mappable`.
mappable[0]


Out[51]:
(SeqRecord(seq=Seq('ACCATGCTACTTGAGGGCATT', IUPACAmbiguousDNA()), id='02-5-1', name='02-5-1_B01_013', description='', dbxrefs=[]),
 (u'986031', 3381, 3361),
 u'|||||||||||||||||||||')

In [52]:
# For each non-end target, pair its printloc result with the 3-nt window beside
# the hit, which is presumably the PAM position -- TODO confirm coordinates.
# printloc returns (fullstring, context, subjectseq, fwdlocs, start, end, feature).
# `t` is deliberately left bound to the last result for later inspection.
pamlist = []
for entry in d:
    t = printloc(entry, ampsdict)
    amplicon_seq, hit_start, hit_end = t[2], t[-3], t[-2]
    hit_strand = t[1].features[0].strand
    if hit_strand == 1:
        # Forward-strand hit: window taken at the hit's end coordinate.
        pam_window = amplicon_seq[hit_end - 1:hit_end + 2]
        pamlist.append((t, pam_window))
    elif hit_strand == -1:
        # Reverse-strand hit: window taken at the hit's start coordinate, then
        # reverse-complemented.
        # NOTE(review): if hit_start < 2 the slice start goes negative and wraps.
        pam_window = amplicon_seq[hit_start - 2:hit_start + 1].reverse_complement()
        pamlist.append((t, pam_window))

In [53]:
# Print the text map (element 0 of the printloc result) for the current `t`.
print(t[0])


1  GCATACAGACTTATAGCATATTCATCAAGCCAAGTCATGTAATATTAAGGGAGCAGGATTAAGAGCAGATGAGATGGCAGTGAGGAAATGACATGCCAGT   100
1  ---------^---------^---------^---------^---------^---------^---------^---------^---------^---------^   100
1                                                                                                         100
101  TTGTTATCTATTACCCAGTCTGATTATCATGAGTTGCATATGGATGGGGACACACATGCTGTCTGGTGGGCAGAGAAGTTGAACCATCAGCTGTCATGGA   200
101  ---------^---------^---------^---------^---------^---------^---------^---------^---------^---------^   200
101                                                                                                         200
201  GTTGTACTTAAACAACAGCTGCGGGACCACAAGTTGCATATTGTTACTGTTTTCATCTTGTCCTACTGTCTCCATCTTGACCTATTTTTGCCATCTTGTT   300
201  ---------^---------^---------^---------^---------^---------^---------^---------^---------^---------^   300
201                                                                                                         300
301  CTACTGTCTCCATCTTCCTTACTGTTACTATCTTGCTCTACTGCCTCCATTGTTAAAAGGCCCACATTCAACATAGTCTTAATGATCCCTCTATTTATTT   400
301  ---------^---------^---------^---------^---------^---------^---------^---------^---------^---------^   400
301                                                                                                         400
401  CTCCACAGTGTAGTTCTCCAACCCAGGGTACAGGATGCCATTTTTTGGGAGGGGTCATTGCATATATATGATAAGCCCCAGGCTTGGCATTTTGATAAGC   500
401  ---------^---------^---------^---------^---------^---------^---------^---------^---------^---------^   500
401                                                                                                         500
501  AGGGGTTGCAAAGAGGACTACAACATAGCACATCAATACATGGCTTATTTTATCATGTAGTAGAGCCACCGGCCATGATGTATGTGCCATGTGCTGCCAT   600
501  ---------^---------^---------^---------^---------^---------^---------^---------^---------^---------^   600
501                                                                                                         600
601  AAGCTGTAAAGTGCCAGTACCCAACTCACACTGCAAATGCCCTTAGTCTGAGGATTTAAGGGAGAAGGCAGAACTAAAGAAAGGGGCTGTGAGAAAGTAC   700
601  ---------^---------^---------^---------^---------^---------^---------^---------^---------^---------^   700
601                                                                                                         700
701  TGGCGCACACAACATATAATGTAATATATCGTACTAATAAAACAAAGGCTTTATTTACTGTTCGCTTTACTTTCCCTTTAAGCCACCAAAGATAAATTAG   800
701  ---------^---------^---------^---------^---------^---------^---------^---------^---------^---------^   800
701                                                                                                         800
801  TATCTCTCTAAAGACAGAATACTAGCTGATGTTCACATACAGGCATCTTTCCACTATTCATTCTTCCCACAAAGTCCTCTCTGCCCTACGGCCATATGCC   900
801  ---------^---------^----<<<<<<<<<<<<<<<<<<<<------^---------^---------^---------^---------^---------   900
801                         cutsite_rev                                                                     900
901  TCCTGACAGTGGTATGATGATACAATGCCCAGCACAGGGTCTAAAAACCATCTGCAAAAGAATAGACTGGAATAAACAAAGCTGCCATATTCCCCATGCA   1000
901  ^---------^---------^---------^---------^---------^---------^---------^---------^---------^---------   1000
901                                                                                                         1000
1001  ACTTTCCACCCACCCCCATAAATCCCACCCTCCCCATATTTTTGGCCAAGGGTAGAGAAGCTAAGTTGACCCTGGCAATGGCTTTAATGCATGGGGAATG   1100
1001  ^---------^---------^---------^---------^---------^---------^---------^---------^---------^---------   1100
1001                                                                                                         1100
1101  GGAAGCTCTTGGATGCTCTGCGGGAAGTGTGCAGGTTGCAGACACTGCAGAATCCTTTGCATTGCTGCTAAGTACAACCGGGCCGGCCGTTTGCCCACCA   1200
1101  ^---------^---------^---------^---------^---------^---------^---------^---------^---------^---------   1200
1101                                                                                                         1200
1201  GGTAGTTAAATTTGTCCTCTCCCAGGAGGGGACTCCAAAACCCTGGGTTGTCTCTCAGCATCTCTCTCATTTTAAAGTTGAGGGAAGATGAGAAAAGCAA   1300
1201  ^---------^---------^---------^---------^---------^---------^---------^---------^---------^---------   1300
1201                                                                                                         1300
1301  GAGATAATCCAGGCAGAGCTTTTTGGACTCCACGTTGACTTTGGAACTCCACAGCTGCTCCAAAATGACGTCCAAGGGGGTTTCCCCTTTGCGGTCAATT   1400
1301  ^---------^---------^---------^---------^---------^---------^---------^---------^---------^---------   1400
1301                                                                                                         1400
1401  ATTTTGGGTGAGGCCCCGTTGCCTAGAAGAGTCAAGACAGTTTCGGCGCTCAGAAGTTCACAGGCCAAGTGCAAAGGTGTTTTACCATCCGACAAATGAA   1500
1401  ^---------^---------^---------^---------^---------^---------^---------^---------^---------^---------   1500
1401                                                                                                         1500
1501  AACAACTCGCCCTGTTGATGTAGGACTGAAGGCTGGGCAGCTCATGGATCACCTTAATGATCATGACTAAAATGTCCCTCCGATCATAAGTAACCGCCAG   1600
1501  ^---------^---------^---------^---------^---------^---------^---------^---------^---------^---------   1600
1501                                                                                                         1600
1601  GGCTAGATGAGGTGCAGAGGGTGGACAGCAGCAGAATTTGATGCCTGGCACCTTCAAGGCTTCCTCTGCATAGTGAGACAGCAGATACTGAGCATAGCGT   1700
1601  ^---------^---------^---------^---------^---------^---------^---------^---------^---------^---------   1700
1601                                                                                                         1700
1701  AGGTGGTTATGCACTACAGCGTACATGAGGGCGTCGGATGGTAAGTAGCACCTAAGACTTGCGTCTTCCTCCAAATGAAAATACTCCATAGTCCTCATTT   1800
1701  ^---------^---------^---------^---------^---------^---------^---------^---------^---------^---------   1800
1701                                                                                                         1800
1801  CTTCCAGTTTCCACACCGGCTCCTGGTCCCGCACCGCCTGGTAGAACATATAGGCGTAGTACTTGCACTGCTCATCCTGCAAGCTGACTTGATTGCTGCA   1900
1801  ^---------^---------^---------^---------^---------^---------^---------^---------^---------^---------   1900
1801                                                                                                         1900
1901  CATGTCTCCTGCCCAGGGAGAGGGAAACAAAGAGAAGAGTCAATGGTTTCCCTGCAGGGTTCCACGAGGAACGTCTTTCATGTGTCGCCGCCGTCACGCA   2000
1901  ^---------^---------^---------^---------^---------^---------^---------^---------^---------^---------   2000
1901                                                                                                         2000
2001  GTCCTTCCAGGGAGCAGAGCCTGGTCAGATGCCACCATCACTGCTGATTTATTCCTGGAGAACAAGCCTCCTGCTTGAGTTAACACTTGTCTCACCGATT   2100
2001  ^---------^---------^---------^---------^---------^---------^---------^---------^---------^---------   2100
2001                                                                                                         2100
2101  CCTCCCAATTCCTTCTGCCCGGCAGCCTAAAGTATGGGCACTGGTTACAGAATGGCATGGGATCCTATCATATAAAGGATGATCAGTCCATAGGGTTGAT   2200
2101  ^---------^---------^---------^---------^---------^---------^---------^---------^---------^---------   2200
2101                                                                                                         2200
2201  TGAATCCGATAACCTTGAACCTTCCTCTTTCTTCTCCACCCACCTGCCCCTCACGGTTTCCCTGCAGCCTTTTGCTGCATTTGGATGATGGGAAGCTGCT   2300
2201  ^---------^---------^---------^---------^---------^---------^---------^---------^---------^---------   2300
2201                                                                                                         2300
2301  GTCGGCTCACTAGCCTGCTGCTCCCTGCAGTGGTGAATGGAAGGGTGGGGGTTGCAATCTATTGAGGGTTTTGCTGTGTAGGAAGAAAAATACTAAGTAT   2400
2301  ^---------^---------^---------^---------^---------^---------^---------^---------^---------^---------   2400
2301                                                                                                         2400
2401  ATATCAAGTGGATCTCCGGTAGAGTCAGAGGCTTTCTGATAGGGAGGAGACTAGCAGCAGCCCCTCCGTTCTGCACCCACAGTGCAATTATAGCACAAAT   2500
2401  ^---------^---------^---------^---------^---------^---------^---------^---------^---------^---------   2500
2401                                                                                                         2500
2501  CTGCAGACACTGGAGCGCTGGAATCATCAGCTCCTTAATAGTGACTGAATATTAAAATGAACACAGGATTCAGCCAAACCCCAGTACTTTCTGCAGGATT   2600
2501  ^---------^---------^---------^---------^---------^---------^---------^---------^---------^---------   2600
2501                                                                                                         2600
2601  CTTTAGTTTTTAGCCAAGCCCAGTCCAAACTCATGCAAATTTAAAGCAGTGGTCAAGCAAATATAACACTGTTTTTGTTCACAATTGATTGACAAATGTG   2700
2601  ^---------^---------^---------^---------^---------^---------^---------^---------^---------^---------   2700
2601                                                                                                         2700
2701  AATATGCAAATTAGGAGTCCATTCAGGAACTGTATGCACCAAACATATGTTCATCCGAATCCATATTGCCACACTGCACACGAGTCGTGGTGTACACGAT   2800
2701  ^---------^---------^---------^---------^---------^---------^---------^---------^---------^---------   2800
2701                                                                                                         2800
2801  CCCCCTGTGTGGTTTTGAACTGGGCATCAACACTTTAAAACATTGGATAGAAATTCAGCCAGTTGCATCAAAGTGATGCAATAACCTTGCCCTGGTGTCA   2900
2801  ^---------^---------^---------^---------^---------^---------^---------^---------^---------^---------   2900
2801                                                                                                         2900
2901  TAACTCCACCAACATTATTGTGCCCACCTCTGATGTCATCATGCCTGTTCCTACATCACTGCCCCACTCCCAATGTTATCTGCCCCGCCTAGGGTTGCCA   3000
2901  ^---------^---------^---------^---------^---------^---------^---------^---------^---------^---------   3000
2901                                                                                                         3000
3001  CCTGGCCGTTATTTTACCGGCCTGGTCGATAAAAATGATGGTTGATCCCAATGTTATTAATAGGGAAAAAAGATAAATATATAGGAAGGCCGGTATTTTT   3100
3001  ^---------^---------^---------^---------^---------^---------^---------^---------^---------^---------   3100
3001                                                                                                         3100
3101  TTCCAGAAAAGGGGGCAACCCTAGCCCCACCCCCCATGTTCGGGTTTAGCCATCAGCAAAGGTGGCAACTCTAGTACAGCTCTTCTAATAATGTTGGAGT   3200
3101  ^---------^---------^---------^---------^---------^---------^---------^---------^---------^---------   3200
3101                                                                                                         3200
3201  ACAACCCTGTTGGCTGTCATGAATGGTTTGAGAGTTGTAGTTTAACAGCTGAAGGGTCATAGTTATCCATCATTCTGTGTGTATGTCACATGGTGTAACT   3300
3201  ^---------^---------^---------^---------^---------^---------^---------^---------^---------^---------   3300
3201                                                                                                         3300
3301  GAATAAACAAAGGCAAAACACACATGGATATGGGAACACCTGATATAAACTACTTGTCACTAGCCCCTCCCACCTATACACAGTCAGTAATAGCAAAGCA   3400
3301  ^---------^---------^---------^---------^---------^---------^---------^---------^---------^---------   3400
3301                                                                                                         3400
3401  AGGGTTTGTTGAAAGTTGCAGTTCAGCTGGAAGGCTACAGACTAGACAGCATTGCTCTATATGAATAATAGGGTTGTATATATGGTTGCCACCTGGCCGG   3500
3401  ^---------^---------^---------^---------^---------^---------^---------^---------^---------^---------   3500
3401                                                                                                         3500
3501  TATTTTACTGGCCTGGCCGGTAAAAATGATGGTTGATCCCAATGTTATTAATAGGGGAAAAAGATAAATATAAAGGAAGGCCGGTGGCAACCCTAGTTGT   3600
3501  ^---------^---------^---------^---------^---------^---------^---------^---------^---------^---------   3600
3501                                                                                                         3600
3601  ATATTGGAAGTGGGGACTCTGGGACCTCGTCTTCTGGTTATTGACTAATGTACTACTGTAAGGAAGGCAGGAGTTGATGGGAGTTCTGTGCAGTGAATGT   3700
3601  ^---------^---------^---------^---------^---------^---------^---------^---------^---------^---------   3700
3601                                                                                                         3700
3701  GAAAATGTGTGATATATTCAGGGAAGTGTTCCCCTGTGAGACTGTACAAAGGAGTGAGGACATCAGTGTTCTTATTAACTATGCGCCGATACAGATGGAT   3800
3701  ^---------^---------^---------^---------^---------^---------^---------^---------^---------^---------   3800
3701                                                                                                         3800
3801  CCATCAGTTTTGGTGTAAATTATGCATAGGACCTGCATGAATTATTCAACACCCCTCTTGAAAATCACTGTATTCTGTGCTTAGTCCTCAAATCCACAAT   3900
3801  ^---------^---------^---------^---------^---------^---------^---------^---------^---------^---------   3900
3801                                                                                                         3900
3901  AGGATTTTTTTCCAGTAGTTTAAATTCTCTCCCTCTTTTTACTGCTAAGGGGAAGCCCCAGGGGCTAATATACAGCTGTGGGTTAAAAACACCCCATTCC   4000
3901  ^---------^---------^---------^---------^---------^---------^---------^---------^---------^---------   4000
3901                                                                                                         4000
4001  CTTTATGTATCAACTACTGAACCACTATACCTTCAGATTGGCCTGCTGTCATAGTTTTATGGTATCTCTCTGTACAGACTATGAGCAAACTTAGGGGCTG   4100
4001  ^---------^---------^---------^---------^---------^---------^---------^---------^---------^---------   4100
4001                                                                                                         4100
4101  TTCCTGCTGAATTGTGCTTAGTACAGGGGGATACCTATGCTGCCATAGTTTTATGGTATCTCTCTGTACAGACTATGAGCAAACTTAGGGGCTGTTCCTG   4200
4101  ^---------^---------^---------^---------^---------^---------^---------^---------^---------^---------   4200
4101                                                                                                         4200
4201  CTGAATTGTGCTTAGTACATGGGAATACCTATGCTGCCACAGTTTTATGGTATCTCTCTGTACAGACTATGAGCAAACTTAGGGGCTGTTCCTGCTGAAT   4300
4201  ^---------^---------^---------^---------^---------^---------^---------^---------^---------^---------   4300
4201                                                                                                         4300
4301  TGTGCTTAGTACAGAGGAATACCTATGCTGCCATAGTTTTATGGTATCTCTCTGTACAGGCTATGAACCCTACCTAATGAGCTGTTACTGCTGAATTGTG   4400
4301  ^---------^---------^---------^---------^---------^---------^---------^---------^---------^---------   4400
4301                                                                                                         4400
4401  CCAAGTACTGTAAAGGGAAATACTTATGCTGTCATAGTTGTCTCAATGTGACAAATGATGGAATTATCTTTGCTATGATAAGTCCCAGAACTGCCGCTGG   4500
4401  ^---------^---------^---------^---------^---------^---------^---------^---------^---------^---------   4500
4401                                                                                                         4500
4501  GTCTGCTGTATCTAGTGTTATGTCCCTGAACCACACTTGCTTGATTGCCCAGGGAATACTTGGGTACAGATGTCCACTGCTACTCTGTCATTACTGGACA   4600
4501  ^---------^---------^---------^---------^---------^---------^---------^---------^---------^---------   4600
4501                                                                                                         4600
4601  TGAAATAGATCATTTATTGACAGCGGTGCCGTGTTTTGCACTGAGCTCTCCTGCTATAGCAGCAAGTATCATTTCAGCCCTAGGAGCTGACACTGTTGTC   4700
4601  ^---------^---------^---------^---------^---------^---------^---------^---------^---------^---------   4700
4601                                                                                                         4700
4701  TCTGTGCTGTAGTTCTCCACTTTTGTCACAGGAAATTAATCCATATCAACATGGTCACCCATCCACTTGTGCCCAGAAAAAGACTGTTAGGCATGGAAAA   4800
4701  ^---------^---------^---------^---------^---------^---------^---------^---------^---------^---------   4800
4701                                                                                                         4800
4801  CAAGAGTTTTAGATAGAATCTGTGCAGCCACTGGGACAGAATGCTCTGTTATACAGATAGCCATAAAGCAGGACAGGACTGCTGCTTACAATGGGGATCA   4900
4801  ^---------^---------^---------^---------^---------^---------^---------^---------^---------^---------   4900
4801                                                                                                         4900
4901  GACAGGATTATTGCTCATTTACTACATATCATACATAGTTTCAAGGCACAAATAAAAATCATGTATTCTGATATATCGTCTCTCTGTTGGAGGCATTTCT   5000
4901  ^---------^---------^---------^---------^---------^---------^---------^---------^---------^---------   5000
4901                                                                                                         5000
5001  ATCAGAGCTGCACAAAGACTTCTTCCCAGCAGACTTCATCCCAAGAGACTTTCCCTCATACAATGAGTATTCAGCTCCCCCTATCGGCAAAACACGAGCA   5100
5001  ^---------^---------^---------^---------^---------^---------^---------^---------^---------^---------   5100
5001                                                                                                         5100
5101  CAGATAATCTCTCAGGCTCCTCCAAAAGACTAAAGCGCTCACTGTGCGCCCTCTACAGGTTGGAACGGCTGACAACAGGGTCACGTGACTAGAACGCGGC   5200
5101  ^---------^---------^---------^---------^---------^---------^---------^---------^---------^---------   5200
5101                                                                                                         5200
5201  ACTATTTAATTGCCGGTGCGGATTCCATCACGTGACGAAATAGAATTACACAGCTAGTGCCACATACAAGAATAAGGCGGTCTTTATCGCCGGAGCATTA   5300
5201  ^---------^---------^---------^---------^---------^---------^---------^---------^---------^---------   5300
5201                                                                                                         5300
5301  CAAGGAGTAAAAACTGCCGAGTTAATTAACAGGAATTCTTTTTGAAGGTATTTCCAGCGTTACTTTTCCTTTAACCATCTATACGTAGGACACACACACA   5400
5301  ^---------^---------^---------^---------^---------^---------^---------^---------^---------^---------   5400
5301                                                                                                         5400
5401  AACAAGCCCTCCTCCTTTATTCGATCCGTTACTATCCCTTTAAGACAGCATCGATAGGGTTAAGAAGCCCTCGTGGCTGGAATGTTGTGCTGTCACAGGG   5500
5401  ^---------^---------^---------^---------^---------^---------^---------^---------^---------^---------   5500
5401                                                                                                         5500
5501  GTGGAGGCGGGGATTGGTGACGTCAGAAGGTCCCGCCTCCTCTCGCTGCGGCTCCTGGGTGGGCGGGACTTTGTGCCGGATGATTGACGGCTTCAGTAGG   5600
5501  ^---------^---------^---------^---------^---------^---------^---------^---------^---------^---------   5600
5501                                                                                                         5600
5601  GAAGAGGGAGGGGCCTAGTCAGACAGAACATTTGGTGTCGGTGCAGCAGTTCGTGTCTCGGGACCAGGGAGCGGCCGTAGGGAGTTCTGCGCTGGGAGCA   5700
5601  ^---------^---------^---------^---------^---------^---------^---------^---------^---------^---------   5700
5601                                                                                                         5700
5701  ACTAGAGGCTGGTGGAGAGGAGAGGCCTGTGCTTGAGACGAGGAGACCCCCTTTCTGTGTGCGGCCTGACTGGTACGGCTTGTGCCCCGCTCGCTCGTAG   5800
5701  ^---------^---------^---------^---------^---------^---------^---------^---------^---------^---------   5800
5701                                                                                                         5800
5801  CTTCTCCTGGGGTCGGGGAGGCGCCTCCAGGCTGACAGCGGAGTTTTGGGTGTCCGAGGGGCGGGATAATGACCTCATGGCTTGAGGGGGCTCTTAGGGC   5900
5801  ^---------^---------^---------^---------^---------^---------^---------^---------^---------^---------   5900
5801                                                                                                         5900
5901  CATGGGAAGCTTCAATCTGGATCTGACCAGACTGGGGGAGGGGCGTTGAAGAATGTGGGCGTGGCTAAATACTGCTGGGTCAGTTTGGGGGGAGGGGCAT   6000
5901  ^---------^---------^---------^---------^---------^---------^---------^---------^---------^---------   6000
5901                                                                                                         6000
6001  AGTAAAGTTGATGAGTGTAAGAATAGGGGACAGGAGCAGGGCCGGACAGAGAGAAGGTGATGAGACAATAGAGAAGAGACACGTCCTGGCCAGATGGGCA   6100
6001  ^---------^---------^---------^---------^---------^---------^---------^---------^---------^---------   6100
6001                                                                                                         6100
6101  CAGAACCCAAAGTTTGTTTGTGTGTTGGGGGGGGGCACATGGGCTGAGTTTGGGGCAGTTTCTTCTCACTAATCCTAGAAATGGTCTCATGTTATAGTAT   6200
6101  ^---------^---------^---------^---------^---------^---------^---------^---------^---------^---------   6200
6101                                                                                                         6200
6201  TGTATGAAAGTTGTCTCCTGCTCATGTCTCCTC   6233
6201  ^---------^---------^---------^   6233
6201                                      6233


In [54]:
# Display the element at index 1 of `t`; `t` is not defined in this cell.
print(t[1])


ID: <unknown id>
Name: <unknown name>
Description: <unknown description>
Number of features: 1
Seq('ACAGAATACTAGCTGATGTTCACATACAGGCATCTTTCC', SingleLetterAlphabet())

In [55]:
# Dump every pamlist entry together with its list position for manual review.
# NOTE: `item` is the enumerate() index, not the entry itself. Each entry is
# unpacked as a 2-tuple (j, otherjunk); only otherjunk is printed, j is unused.
# Single-argument print(...) calls produce the same output under Python 2 and
# also run under Python 3 (the bare `print x` statement form does not).
for item, (j, otherjunk) in enumerate(pamlist):
    print(item)
    print(otherjunk)
    print("\n")  # blank lines to visually separate consecutive entries


0
ID: <unknown id>
Name: <unknown name>
Description: <unknown description>
Number of features: 0
Seq('AAA', SingleLetterAlphabet())


1
ID: <unknown id>
Name: <unknown name>
Description: <unknown description>
Number of features: 0
Seq('TGG', SingleLetterAlphabet())


2
ID: <unknown id>
Name: <unknown name>
Description: <unknown description>
Number of features: 0
Seq('AGG', SingleLetterAlphabet())


3
ID: <unknown id>
Name: <unknown name>
Description: <unknown description>
Number of features: 0
Seq('TGG', SingleLetterAlphabet())


4
ID: <unknown id>
Name: <unknown name>
Description: <unknown description>
Number of features: 0
Seq('TGG', SingleLetterAlphabet())


5
ID: <unknown id>
Name: <unknown name>
Description: <unknown description>
Number of features: 0
Seq('AGG', SingleLetterAlphabet())


6
ID: <unknown id>
Name: <unknown name>
Description: <unknown description>
Number of features: 0
Seq('ATT', SingleLetterAlphabet())


7
ID: <unknown id>
Name: <unknown name>
Description: <unknown description>
Number of features: 0
Seq('GAA', SingleLetterAlphabet())


8
ID: <unknown id>
Name: <unknown name>
Description: <unknown description>
Number of features: 0
Seq('GAT', SingleLetterAlphabet())


9
ID: <unknown id>
Name: <unknown name>
Description: <unknown description>
Number of features: 0
Seq('GTG', SingleLetterAlphabet())


10
ID: <unknown id>
Name: <unknown name>
Description: <unknown description>
Number of features: 0
Seq('CGG', SingleLetterAlphabet())


11
ID: <unknown id>
Name: <unknown name>
Description: <unknown description>
Number of features: 0
Seq('AGG', SingleLetterAlphabet())


12
ID: <unknown id>
Name: <unknown name>
Description: <unknown description>
Number of features: 0
Seq('GAG', SingleLetterAlphabet())


13
ID: <unknown id>
Name: <unknown name>
Description: <unknown description>
Number of features: 0
Seq('GAG', SingleLetterAlphabet())


14
ID: <unknown id>
Name: <unknown name>
Description: <unknown description>
Number of features: 0
Seq('ATA', SingleLetterAlphabet())


15
ID: <unknown id>
Name: <unknown name>
Description: <unknown description>
Number of features: 0
Seq('AGG', SingleLetterAlphabet())


16
ID: <unknown id>
Name: <unknown name>
Description: <unknown description>
Number of features: 0
Seq('AGG', SingleLetterAlphabet())


17
ID: <unknown id>
Name: <unknown name>
Description: <unknown description>
Number of features: 0
Seq('TGG', SingleLetterAlphabet())


18
ID: <unknown id>
Name: <unknown name>
Description: <unknown description>
Number of features: 0
Seq('TGG', SingleLetterAlphabet())


19
ID: <unknown id>
Name: <unknown name>
Description: <unknown description>
Number of features: 0
Seq('TAG', SingleLetterAlphabet())



In [56]:
# Display element [0][0] of pamlist entry 14. The call form of print works
# identically in Python 2 for a single argument and is valid in Python 3.
print(pamlist[14][0][0])


1  TTTTACCCCAACAAAATGACAGTTGCATATTGCAGATTTGATTAACAATGTTACATCCCGTATTATATGGAAGCTACTGCCGTCGTTAATTGAATCCCGA   100
1  ---------^---------^---------^---------^---------^---------^--------<<<<<<<<<<<<<<<<<<<<<<^---------   100
1                                                                     cutsite_rev                         100
101  GTGCATTGTCTTCAGTTGTGTGTCTAGAACATTCCCAACCATGTATTGCTGTAAGGAAGGACTAATGGCTGTCAAGGTGGTGACGCACAATCACACACAA   200
101  ^---------^---------^---------^---------^---------^---------^---------^---------^---------^---------   200
101                                                                                                         200
201  GTGTATAGCGAGGATGAGGCCTAAGGCACATGGGGGGTATTCTCAGAAGGCCACATTGTTGCTTCTAGGCTAATAAGATGCCCTTCTGCCTTTAAGATGC   300
201  ^---------^---------^---------^---------^---------^---------^---------^---------^---------^---------   300
201                                                                                                         300
301  ATTTACCTGTTGAGTGCCCTTACTTTTTGAGTTGACACCCGTCACTTTGTCACCCACCATGAAATCCGTCCAATAGTATTGCATGTGTGGGGCTAATGGC   400
301  ^---------^---------^---------^---------^---------^---------^---------^---------^---------^---------   400
301                                                                                                         400
401  ACTAGATGTTCAAAGGACACATAGTGTTGCTATGGTATAGTTCAGCTGCAGTGCATTTCACTAGGGTGTACATAACATTACAACCAGTATGGCAGCAGCT   500
401  ^---------^---------^---------^---------^---------^---------^---------^---------^---------^---------   500
401                                                                                                         500
501  AGTGCTTTTGGCTGTAGGAACCCCCTGTTTCAACCAATCATGCAAACCCCAAGATGTGGCACCTGACAAGTCTTGTGTTGTCTTTAGCCTTGGCAAGATC   600
501  ^---------^---------^---------^---------^---------^---------^---------^---------^---------^---------   600
501                                                                                                         600
601  TTTGGGTGATGTTTAATCGAGCCATGGTTAAGTATTCGTGCAGAAGGGCTCAAGGAAATGGCCTTAAAATCCTTCTACTCTTCTTGTCTTTGCAAAACCT   700
601  ^---------^---------^---------^---------^---------^---------^---------^---------^---------^---------   700
601                                                                                                         700
701  CTGTCAAATTCAGAAATATATGGAGGTAACTTGCTCAATGGTAACAAGTCAGTAAATGATGACTGAAGCCTTTGCAGCGTTTCCATTAAGCAAGGACTTG   800
701  ^---------^---------^---------^---------^---------^---------^---------^---------^---------^---------   800
701                                                                                                         800
801  TAGCAGTCAGTGGTCTCTGCTGATCACTTCCAGTTCATGCTCCACCTAGTGGCAGAATGCAGAAGTACGACACATTGATTTGTTATATGTTATGGGTTCT   900
801  ^---------^---------^---------^---------^---------^---------^---------^---------^---------^---------   900
801                                                                                                         900
901  AATCAGAGGACACACAAGTGACTGTGGGACTAGTAGTCTTATTGGAGATGGGGGAGAATAAAGGGAGAATGGGACAGTTATTTATAATAAACACGATCAA   1000
901  ^---------^---------^---------^---------^---------^---------^---------^---------^---------^---------   1000
901                                                                                                         1000
1001  CGTGACATTTTGCCTGGGGTAAATTAAGAAAGTTTGAGATTGGCTGATTTATGTTGCACTGACTGGTAGTTAATTGTTCACTTTCTCTTCTCCGTGTGAT   1100
1001  ^---------^---------^---------^---------^---------^---------^---------^---------^---------^---------   1100
1001                                                                                                         1100
1101  TTACCCTTTAATCTTTTTTGTTTGGCCTGTGTTGGTTACTCCTTTCCATTTGCTTCTTTACCAAACCAGGCCGAGAGCGCAGGCCTTTTAGGAATCAACC   1200
1101  ^---------^---------^---------^---------^---------^---------^---------^---------^---------^---------   1200
1101                                                                                                         1200
1201  TTTGTGTGTCCCTTTATGTTGTTCCACTATAATTATATATATCAACCCCTGTCGGTTGCAATGCAGTGTGTGTGCTCCACTCACCCGAGGTGGGGGATGT   1300
1201  ^---------^---------^---------^---------^---------^---------^---------^---------^---------^---------   1300
1201                                                                                                         1300
1301  AGAAAAGTAGCAGTTTCTTAAAGGTTTTTAATATCTCAATCTGCTCGATGCATGGATTTGTACATATATATATTAATGTTATCTTTATTTGATTTTTCTG   1400
1301  ^---------^---------^---------^---------^---------^---------^---------^---------^---------^---------   1400
1301                                                                                                         1400
1401  CTTGGGAAAGAAATGAATGAGCCAAGGTCGATAAATCGGCAGCTCCGGGGGTTGCACTACAGCTACTGGGGCTGAAAATTAAAAGTCTCTGGCTCAATTA   1500
1401  ^---------^---------^---------^---------^---------^---------^---------^---------^---------^---------   1500
1401                                                                                                         1500
1501  TAGGCAAGTCAGGCTTTTCCATTTGCAGCTGTTGCCAGGGTTGCCAGGTCTAATTTTGAAAAACAGCCAAAGTCGGCTACAAAACTAGCCAAGAGGCACT   1600
1501  ^---------^---------^---------^---------^---------^---------^---------^---------^---------^---------   1600
1501                                                                                                         1600
1601  TAAAAGTAGCCCAAAAATAGCGCAATATGTGCAGTGAAAAAAATTCTAAAGAAATGAATGAATTTATGTGAAAAAATGCCTTTTTGAATTGTTCACTGTT   1700
1601  ^---------^---------^---------^---------^---------^---------^---------^---------^---------^---------   1700
1601                                                                                                         1700
1701  TAACAAATGTTCCCATGAATTCGCCCGTCACCGGGGATGTAACAACAGAAGGGGCCCAGGAGATATAGGGGCCCCGTAATACATATACAATTTCAATAAA   1800
1701  ^---------^---------^---------^---------^---------^---------^---------^---------^---------^---------   1800
1701                                                                                                         1800
1801  TATTGGTGAAAAAGCTCAACCTCTAGACATTTTGGTGGCCAGCAGATTTTTGCTGAAATTGAGGAAAGTCACCGGAATGGGAGACGGGGGGGACAGAGCC   1900
1801  ^---------^---------^---------^---------^---------^---------^---------^---------^---------^---------   1900
1801                                                                                                         1900
1901  CAAAATCTGATAACTCCCATCTTCTACACAAGTCAGTCCCATTGCATGCTGGGTATTGTAGTCTTACATTAAACTTGGCAGTTGGCAAATTGTTAAACAA   2000
1901  ^---------^---------^---------^---------^---------^---------^---------^---------^---------^---------   2000
1901                                                                                                         2000
2001  AAACTACATTACCCAGCATGCATTGGGAGACGCAGGCTTGGCTCATTAGGCAAGAAAAAACTGTGGCTGGTTTCCAAATTACAAACCAGCTAAAGGCCCA   2100
2001  ^---------^---------^---------^---------^---------^---------^---------^---------^---------^---------   2100
2001                                                                                                         2100
2101  AAAAGTAGCCCAAATCCTTACCTTGGCTAGTTTGTACTTTTAAAACCTGCCTGGGCTTTAAATTAGTAGCCCAGTTTGGCTGGAAACCTGCCGACCGTGG   2200
2101  ^---------^---------^---------^---------^---------^---------^---------^---------^---------^---------   2200
2101                                                                                                         2200
2201  CAACCCTGGCCGCTGCTTCAGATACCGTCGGACGCGGCTACTTGTATGATTTTGTATAAACGCATCGGAACAGGGCGGAGCTACAAAGAATAATGTAAAT   2300
2201  ^---------^---------^---------^---------^---------^---------^---------^---------^---------^---------   2300
2201                                                                                                         2300
2301  ACGTCACAGACACAAAGGAGAAAAGCACTTTCGGAATCCTCGGAACGCAGACTGTCGTCCCTCCTGGTCATCGTTCTGCTCCTTGTTCTTCTGACTTCTC   2400
2301  ^---------^---------^---------^---------^---------^---------^---------^---------^---------^---------   2400
2301                                                                                                         2400
2401  TGTCTCATTGCCTTCAAGCTGTCGGGGGACGCAGCACAAGTGCAATTGAATCTTTCAATAATGTATTTAACGGTAATGAAACCTCCAAGTCTTGGGCTCT   2500
2401  ^---------^---------^---------^---------^---------^---------^---------^---------^---------^---------   2500
2401                                                                                                         2500
2501  TTCCTCTTTTGTTTTCATTTCTGTTTGATACGTGTTGTGGTGTCGCCGAGGGGCGGCCGCGTCTCACGCTGTACATTGTTCAGCTCTTACAGAAATGACC   2600
2501  ^---------^---------^---------^---------^---------^---------^---------^---------^---------^---------   2600
2501                                                                                                         2600
2601  TTTGTCCTTATTAAAACAGATTTCTCTCCTTTTATCAAATCATTTCAATGTGTCGTTGGTCTGTCTGTGAGCTCGGTAATTTAATAAGGGCAACGGAATA   2700
2601  ^---------^---------^---------^---------^---------^---------^---------^---------^---------^---------   2700
2601                                                                                                         2700
2701  CTTGCTGGCAATTGTCGCCAGTTACATCGGGCCCCATACAACAACATTTTCTGGGCTCCTCCCAGGCCAGACCCCACCTACCCCATTGGTAACTATGGCC   2800
2701  ^---------^---------^---------^---------^---------^---------^---------^---------^---------^---------   2800
2701                                                                                                         2800
2801  CCCTACTACAAGTTAAAAACATCATTGGTGGTCAAGCCCCCCAATTCAAGTAAAAAAATAATCATAGGTGACCAGTGCCCCCCCCTACAAGTTAAAAAAA   2900
2801  ^---------^---------^---------^---------^---------^---------^---------^---------^---------^---------   2900
2801                                                                                                         2900
2901  AAAAATGTTGGTGCCCAGGGTTCCCCCTACAAGTTAAAAAAACAGGAAAAGCAACTGTAACTTGATTTATTTTATTGTAAAACAATTAGAATTAAAAAAA   3000
2901  ^---------^---------^---------^---------^---------^---------^---------^---------^---------^---------   3000
2901                                                                                                         3000
3001  TATTGGAGACCAGATTTCTGTCCTACAAGTTAAAAAACAAAAATATTGGTTGGAAAACAGGAACAGCAACTGTAACTAGATTTCTTTTATTGTAAAACAA   3100
3001  ^---------^---------^---------^---------^---------^---------^---------^---------^---------^---------   3100
3001                                                                                                         3100
3101  TAAAAAATATTGGTGACCAGGGTTCCCCCTACAAGTTAAAAAAAAATTATTGGTGGCCTTTCTTATGCACAGATTGCCAGCTATGACATCACAGGTGGGA   3200
3101  ^---------^---------^---------^---------^---------^---------^---------^---------^---------^---------   3200
3101                                                                                                         3200
3201  TTGACATGACAGAGCCATCTTGGTAGAGAAAATCTGTTTAGGTGGATCTGGGTTTCTCAACAATTCCTTGGTGAATAGCAGAGGGGCCAGACTATTCTGC   3300
3201  ^---------^---------^---------^---------^---------^---------^---------^---------^---------^---------   3300
3201                                                                                                         3300
3301  TTAGTCTCTAGACAGCAGGACCACATCCTGGGCATCTCAATCGAAGGCCGTCTCTTTTCCTTGACACTCGTCCTTCTCTCAGGTGGTCTCCCAAGATGAC   3400
3301  ^---------^---------^---------^---------^---------^---------^---------^---------^---------^---------   3400
3301                                                                                                         3400
3401  CAGAGCTGCAGACAGTGTTCTTAGCCAAGTGCTATTGGTGGTCTCACCATCTGGCTTGGGCCGGTTTGAGTTCAATGCAAGAATGTCACAGCAGATCTTA   3500
3401  ^---------^---------^---------^---------^---------^---------^---------^---------^---------^---------   3500
3401                                                                                                         3500
3501  TTATGGGTAGAACACAACAAGGCAATCCTGTCTACTATTCACATTCCAGGGGTTTCCAACACAGAGGTAGACTTGACAGTTGCAATCAGCAAGACCTTGG   3600
3501  ^---------^---------^---------^---------^---------^---------^---------^---------^---------^---------   3600
3501                                                                                                         3600
3601  AGATGAGGAGCTTCATCAGAATGTATTTGGTGGGGCTGACCAAAATCCTTACCCCACCATGGTCAATACAATGATTCAAGCCTGGCCATTTTGTCTGGAC   3700
3601  ^---------^---------^---------^---------^---------^---------^---------^---------^---------^---------   3700
3601                                                                                                         3700
3701  TACATATTACCTTCACTCCCATTGCTGCTCAAAGTTCTAAAGAATGTCCAAAATTGACATAGAGCATTCTGGCCTTGGTTTATTTTTATGGAAAGGCTTT   3800
3701  ^---------^---------^---------^---------^---------^---------^---------^---------^---------^---------   3800
3701                                                                                                         3800
3801  TGAAGCTGCTATTTGGAGAAGAAATGGAATAACCGAGGAGGTAATAACTATGTGAAAAGCCTGAAACCCGCTGCATATGAAGATGCCTACCATAGAGTTT   3900
3801  ^---------^---------^---------^---------^---------^---------^---------^---------^---------^---------   3900
3801                                                                                                         3900
3901  GGGGCTGGTGCCAGTAATACAATGTTTCTACAGCGAGGTTTAGAAGTCGGCTCTAGAAAGGTACAGGTGTTGCACTGTCAGTGCTTTTTCAATCTCCCAT   4000
3901  ^---------^---------^---------^---------^---------^---------^---------^---------^---------^---------   4000
3901                                                                                                         4000
4001  TACCTGGACTGATTGAACTCCTTTTAAATCAGACACCTGAGCCTATGATTAAGTGTTTGAAAGATACGAAACGTGTACCGAAATAAACTACTAATTATTT   4100
4001  ^---------^---------^---------^---------^---------^---------^---------^---------^---------^---------   4100
4001                                                                                                         4100
4101  TACAAGTCTGTCTGGAAGAATGGCCTTTGGAAGTGCCTGCTCTTTTTCAATAAAGTTATCTGCTCCATCAGCTGAAGGTTTAGGGCGGTGCACCTGGACC   4200
4101  ^---------^---------^---------^---------^---------^---------^---------^---------^---------^---------   4200
4101                                                                                                         4200
4201  TCTTCTTGCAACTAGTAAGTTTTTACATTTTGAGACCTGAGAACTTCAATATACCTGTTATCAGCTTGTGTTCATTATATATTTTGAGATCCTCTCCCCG   4300
4201  ^---------^---------^---------^---------^---------^---------^---------^---------^---------^---------   4300
4201                                                                                                         4300
4301  ATATGTCCACCTTCAGGTGGGCAATATCATGCTGATCCGATCGTGGGCCCTAGGGCACAAAGAGAAGAATACGGGGGGTCAGATCGATGACCGCATTAAA   4400
4301  ^---------^---------^---------^---------^---------^---------^---------^---------^---------^---------   4400
4301                                                                                                         4400
4401  AAAACCAATGCCATCCTCCATCCCATGGGATTTTTAAACCCGCCCGACCGACATCTGGCCAACTTTAGGCCAGAATTGATAGGGGAATCTGTCGGCAGCT   4500
4401  ^---------^---------^---------^---------^---------^---------^---------^---------^---------^---------   4500
4401                                                                                                         4500
4501  TATATCTGCCCCTGTATGGGGTCCTTTAAAAAACAGATCAGTACAATTGCACCCCTCTGGTTATTGGATCGTGTACACATGTAAGGGTTCTGTATGTATT   4600
4501  ^---------^---------^---------^---------^---------^---------^---------^---------^---------^---------   4600
4501                                                                                                         4600
4601  AAATACAATGTACCGTTACCCATTCTCTAGTGAAGACCGTGTCCTCACGTCAGAAGCTCTAATATTGTTTAGCCATGGGAGCTTCCACTGAGAGTTCCAG   4700
4601  ^---------^---------^---------^---------^---------^---------^---------^---------^---------^---------   4700
4601                                                                                                         4700
4701  CAAGGCCCAAATATTACTTTACCTTTAATATAGAATGAGCATATAAATCAGTTCTGAGCCCTGTTTGGCCACTCTACTATGGTCATTTCTTCAGCTGTAA   4800
4701  ^---------^---------^---------^---------^---------^---------^---------^---------^---------^---------   4800
4701                                                                                                         4800
4801  GCAAAAGAATTTCCAGACTCTATTATACAGTATTGTGGATAGAAAACTTCACTAGACATAATTTAGGATCCGACTTGGAGATGAGATTCTAAAAGGCACA   4900
4801  ^---------^---------^---------^---------^---------^---------^---------^---------^---------^---------   4900
4801                                                                                                         4900
4901  GGAAAGCAATTAAAGTGTATATAACGGCTGAGCTCAGGCCCGAACTGGCAATCTGTGGGTTCTGGCAAATGCCACATGGGCTGCTGTAAGTTGCCATAGA   5000
4901  ^---------^---------^---------^---------^---------^---------^---------^---------^---------^---------   5000
4901                                                                                                         5000
5001  CAGTCACTATTTATTGGGCTGGTGGAGGGCTGCTTGGGCCTCTGTGTACTTGGAATGCTATTGTCTATTGTAAATCCCAGTCCAGACCTGGCTAAGCTCA   5100
5001  ^---------^---------^---------^---------^---------^---------^---------^---------^---------^---------   5100
5001                                                                                                         5100
5101  TGCAAATGCTGATACACAGTGATGTCATATTTCCCTTAGTGAGTCTGTGACATCATCAGGTATCAAGGCAGCAATCATACATGGACTTAGTTATACCAGG   5200
5101  ^---------^---------^---------^---------^---------^---------^---------^---------^---------^---------   5200
5101                                                                                                         5200
5201  ACCCTATAAAGCTAAGTGGTCAGTCCAGGGGTCAAAGACATGGGACAGCCTCCAAGCTTGATATCCAAAAATATAAGCAATGTTGTTGAAACGCTAACAC   5300
5201  ^---------^---------^---------^---------^---------^---------^---------^---------^---------^---------   5300
5201                                                                                                         5300
5301  CCAGACTTTGCTGGACTACAACTCCCAGACTTCGTAACTGCTCAATTTCATGTTGCAGAATGCTGGGAGATTCCAGACAACCTCAAATTCCAGTCTCCTA   5400
5301  ^---------^---------^---------^---------^---------^---------^---------^---------^---------^---------   5400
5301                                                                                                         5400
5401  TGGTTTATCAGTACAACAGAA   5421
5401  ^---------^---------^   5421
5401                          5421

In this analysis, 6 of the 20 items are not adjacent to an NAG/NGG PAM. However, 3 of those are probably leftover fragment-end duplicates, which leaves 14/17 correct, i.e. an 82% yield.

1 (is in a primer: 989347),

6 (141439),

7 (not in primer...),

8 (not in primer),

9 (not in primer),

14 (is in 4995367).

In [56]:


In [ ]: